diff --git a/server/spacyface/.gitignore b/server/spacyface/.gitignore
deleted file mode 100644
index d45d5eb51cb6ae5dfee3f239e75ffbdf19dbc218..0000000000000000000000000000000000000000
--- a/server/spacyface/.gitignore
+++ /dev/null
@@ -1,22 +0,0 @@
-.DS_Store
-
-# Emacs specific (a leading '#' must be escaped, otherwise the line is a comment)
-*~
-.*~
-\#*
-.#*
-
-# Python
-__pycache__
-*.egg-info
-.ipynb_checkpoints
-.pytest_cache
-
-# For holding files
-.archive
-
-# Notebooks that are not titled
-Untitled*.ipynb
-
-# For distribution on pip
-dist
\ No newline at end of file
diff --git a/server/spacyface/.gitrepo b/server/spacyface/.gitrepo
deleted file mode 100644
index 9f6b6ad29609eb5cdbc6decd9f3cd94714b65f60..0000000000000000000000000000000000000000
--- a/server/spacyface/.gitrepo
+++ /dev/null
@@ -1,12 +0,0 @@
-; DO NOT EDIT (unless you know what you are doing)
-;
-; This subdirectory is a git "subrepo", and this file is maintained by the
-; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
-;
-[subrepo]
-	remote = git@github.com:bhoov/spacyface-dev.git
-	branch = master
-	commit = 1b1df2cb074876bc5a64f934f2aa2c6822d219c0
-	parent = 358151dfadb49f44770bb8e031bcb7e586ff858e
-	method = merge
-	cmdver = 0.4.1
diff --git a/server/spacyface/LICENSE b/server/spacyface/LICENSE
deleted file mode 100644
index bde4440d87d26933ef15f1e7d9428d4791502b9b..0000000000000000000000000000000000000000
--- a/server/spacyface/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2020 Benjamin Hoover
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/server/spacyface/README.md b/server/spacyface/README.md
deleted file mode 100644
index 22efabbeec15984caa829f8b3f86f71fd2819320..0000000000000000000000000000000000000000
--- a/server/spacyface/README.md
+++ /dev/null
@@ -1,136 +0,0 @@
-
-# Spacyface aligner
-
-Align [Huggingface Transformer](https://github.com/huggingface/transformers) model tokenizations with linguistic metadata provided by [spaCy](https://spacy.io/)!
-
-*Currently only supports English tokenizations*
-
-## Getting started
-
-### Pip
-1. Run `pip install spacyface`.
-2. `python -m spacy download en_core_web_sm`
-
-### Manual (Clone and conda)
-1. From the root of this project, create a new conda environment with `conda env create -f environment.yml`. This will create an environment named `spacyface`.
-2. Activate this environment with `conda activate spacyface`. At this point, if you want to install the development dependencies, you can do so with `conda env update -f environment-dev.yml`
-3. You will need to install spacy's `en_core_web_sm` as well. To do this, run: `python -m spacy download en_core_web_sm`
-
-## Usage
-### Basic Usage on a sentence
-Every aligner can be created and used as described in the example below:
-
-``` python
-from spacyface import BertAligner
-
-alnr = BertAligner.from_pretrained("bert-base-cased")
-sentence = "Do you know why they call me the Count? Because I love to count! Ah-hah-hah!"
-tokens = alnr.meta_tokenize(sentence)
-print("Tokens:\n\n", [(tok.token, tok.pos) for tok in tokens])
-```
-
-```
-Tokens:
-
-   [('Do', 'AUX'), ('you', 'PRON'), ('know', 'VERB'), ('why', 'ADV'), ('they', 'PRON'), ('call', 'VERB'), ('me', 'PRON'), ('the', 'DET'), ('Count', 'PROPN'), ('?', 'PUNCT'), ('Because', 'SCONJ'), ('I', 'PRON'), ('love', 'VERB'), ('to', 'PART'), ('count', 'VERB'), ('!', 'PUNCT'), ('Ah', 'INTJ'), ('-', 'PUNCT'), ('ha', 'X'), ('##h', 'X'), ('-', 'PUNCT'), ('ha', 'NOUN'), ('##h', 'NOUN'), ('!', 'PUNCT')]
-```
-
-Because the information is coming directly from spaCy's `Token` class, any information that spaCy exposes about a token can be included in the huggingface token. The user only needs to modify the exposed attributes in the [SimpleSpacyToken](./aligners/simple_spacy_token) class.
-
-This can also be extended to tokenize entire English corpora with the use of a generator. An example raw corpus representing a subset of Wikipedia is included in the [tests](./tests) directory.
-
-### Observing attention between linguistic features
-This library also enables us to look at the attention pattern heatmaps for a particular layer and a particular head in terms of the linguistic features that belong to that layer and head.
-
-``` python
-alnr_cls = RobertaAligner
-model_name = "roberta-base"
-sentence = "A simple sentence for the ages."
-layer = 8
-heads = [7]
-
-alnr = alnr_cls.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name, output_attentions=True)
-model.eval() # Remove DropOut effect
-
-model_input, meta_info = alnr.sentence_to_input(sentence)
-
-_, _, atts = model(**model_input)
-
-to_show = atts[layer][0][heads].mean(0)[1:-1, 1:-1] # Don't show special tokens for Roberta Model
-
-deps = [t.dep for t in meta_info[1:-1]]
-poss = [t.pos for t in meta_info[1:-1]]
-
-plt.figure()
-sn.set(font_scale=1.5)
-sn.heatmap(to_show.detach().numpy(), xticklabels=deps, yticklabels=deps)
-plt.title(f"Layer {layer} for head(s): {heads}\n\"{sentence}\"")
-```
-
-![Attention heatmap Layer 8 head 7](./img/SampleHeatmap.png)
-
-Interestingly, we have discovered that Layer 8, head 7 has a strong affinity for a POBJ (Object of the Preposition) looking at a PREP (Preposition). Cool! We can then test this hypothesis by running example sentences that have multiple prepositions to see if it is looking at all prepositions or just the preposition related to the object.
-
-
-## Background
-Different transformer models use different tokenizations. At the time of this writing, many of these tokenizations split larger English words into smaller tokens and use different methods of indicating that a token was once part of a larger word.
-
-For inspection and research, it is helpful to align these tokenizations with the linguistic features of the original words of the sentence. [spaCy](https://spacy.io/) is a fantastic python library for assigning linguistic features (e.g., dependencies, parts of speech, tags, exceptions) to the words of different languages, but its method for tokenizing is vastly different from the tokenization schemes that typically operate on the sub-word and sometimes byte level. This repository aims to align spaCy tokens with the sub-word tokens needed for training and inference of the different [Huggingface Transformer](https://github.com/huggingface/transformers) models.
-
-In short, *this repository enables the strange and varied tokenizations belonging to different transformer models to be correctly annotated with the metadata returned by spaCy's tokenization.*
-
-Currently, the repository only supports the English language and the following huggingface pretrained models:
-
-- Bert
-- GPT2 (covers distilgpt2)
-- Roberta (covers distilroberta)
-- DistilBert
-- TransfoXL
-- XLNet
-- XLM
-- Albert
-- CTRL
-- OpenAIGPT
-- XLMRoberta
-
-At the time of release, the only model that doesn't work with the alignment is the T5 Tokenization scheme.
-
-Originally created to ease the development of [exBERT](http://exbert.net/), these tools have been made available for others to use in their own projects as they see fit.
-
-## Testing the aligner
-A few edge case sentences that include hardcoded exceptions to the English language as well as strange punctuation have been included in [EN_TEST_SENTS.py](./tests/EN_TEST_SENTS.py). You can run these tests on the established aligners with `python -m pytest` from the root folder.
-
-Sometimes, your application may not care about edge cases that are hard to detect. You can test an alignment on a more representative subset of the English language with the included [wikipedia subset](./tests/wiki.test.txt), or use your own text file corpus. To do this, run
-
-``` python
-from spacyface import TransfoXLAligner
-from spacyface.checker import check_against_corpus
-corpus = 'tests/wiki.test.txt'
-alnr = TransfoXLAligner.from_pretrained('transfo-xl-wt103')
-check_against_corpus(alnr, corpus)
-```
-
-and wait a few minutes to see if any sentences break.
-
-## Notable Behavior and Exceptions
-This repository makes the large assumption that there is no English "word" which is smaller than a token needed for a transformer model. This is an accurate assumption for most of the published transformer models.
-
-It is difficult to align such completely different tokenization schemes. Namely, there are a few strange behaviors that, while not desired, are intentional in order to create a simplified method for aligning different tokenization schemes. These behaviors are listed below.
-
-- Multiple consecutive spaces in a sentence are replaced with a single space.
-- Many tokenizers insert special tokens (e.g., "[CLS]", "[SEP]", "[MASK]", "\<s\>") for certain functionalities. The metadata for all these tokens is assigned to `None`.
-- When a token exists as a part of a larger word, the linguistic information belonging to the larger word is bestowed on the token.
-- The English language is riddled with exceptions to tokenization rules. Sometimes, punctuation is included in the middle of what is a single token (e.g., "Mr." or "N.Y."). Other times, contractions that look nothing like the words they combine (e.g., "ain't" looks nothing like "is not" or "am not" or "are not") create difficulties for aligning. To prevent these from being an issue, this repository replaces the exceptions to the language with their original "normalized" representations.
-
-**Specific to GPT2**
-- Sometimes, GPT2 tokenization will include a space before a punctuation mark that should not have been there. For example, the tokenization of "Hello Bob." should be `["Hello", "ĠBob", "."]`, but it is instead `["Hello", "ĠBob", "Ġ."]` This has not had any notable effects on performance, but note that it is different from the way the original model was pretrained. Hidden representations may be slightly different.
-
-### Known Issues
-- A Spacy exception that is part of a `-`-delimited word (e.g. "dont-touch-me") will cause the meta tokenization to produce a different result from the tokenization strategy. See github issues for a more detailed description of this problem.
-
-### Acknowledgements
-
-- Benjamin Hoover (IBM Research & MIT-IBM Watson AI Lab)
-- Hendrik Strobelt (IBM Research & MIT-IBM Watson AI Lab)
-- Sebastian Gehrmann (Harvard NLP)
diff --git a/server/spacyface/environment-dev.yml b/server/spacyface/environment-dev.yml
deleted file mode 100644
index 4a861dd6de14796558ef33336e47095507e2578e..0000000000000000000000000000000000000000
--- a/server/spacyface/environment-dev.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Development-only additions for the `spacyface` environment; apply them with
-# `conda env update -f environment-dev.yml`.
-name: spacyface
-channels:
-  - conda-forge
-dependencies:
-  - jupyter
-  - pytest
-  - jupyter_client
-  - jupyter_console
-  - jupyter_contrib_core
-  - jupyter_contrib_nbextensions
-  - matplotlib
-  - seaborn
diff --git a/server/spacyface/environment.yml b/server/spacyface/environment.yml
deleted file mode 100644
index bedce3d71d19a407dd3e803616270cc80b1db53c..0000000000000000000000000000000000000000
--- a/server/spacyface/environment.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Conda environment for spacyface; create it with
-# `conda env create -f environment.yml`.
-name: spacyface
-channels:
-  - pytorch
-  - conda-forge
-  - defaults
-  - anaconda
-dependencies:
-  - python=3.7
-  - pip>=19.0.3
-  - pytest
-  - h5py
-  - spacy
-  - regex
-  - numpy
-  - pytorch
-  - sacremoses
-  # Entries nested under `pip:` are installed with pip rather than resolved by conda.
-  - pip:
-    - sentencepiece
-    - transformers
diff --git a/server/spacyface/img/SampleHeatmap.png b/server/spacyface/img/SampleHeatmap.png
deleted file mode 100644
index 1f74743044a6fa5aea542f7bf5a4a8841f46db38..0000000000000000000000000000000000000000
Binary files a/server/spacyface/img/SampleHeatmap.png and /dev/null differ
diff --git a/server/spacyface/setup.cfg b/server/spacyface/setup.cfg
deleted file mode 100644
index 224a77957f5db48dfa25c8bb4a35f535202da203..0000000000000000000000000000000000000000
--- a/server/spacyface/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[metadata]
-# NOTE(review): setuptools reads a README through `long_description = file: README.md`;
-# confirm whether any tool in the release process still consumes this legacy key.
-description-file = README.md
\ No newline at end of file
diff --git a/server/spacyface/setup.py b/server/spacyface/setup.py
deleted file mode 100644
index e5b42c63f28eded67521df34bd7c075a2c53d076..0000000000000000000000000000000000000000
--- a/server/spacyface/setup.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""Packaging script for the spacyface distribution."""
-from setuptools import setup, find_packages
-
-# Runtime dependencies installed alongside the package.
-requires = [
-    'transformers>=2.3.0',
-    'h5py>=2.10.0',
-    'numpy>=1.17.4',
-    'regex>=2020.1.8',
-    'spacy>=2.2.3',
-    'torch',
-]
-
-setup(
-    name="spacyface",
-    description="Aligner for spacy and huggingface tokenization",
-    # Discover packages instead of hard-coding ['spacyface']: aligner.py imports
-    # `spacyface.utils.f`, and a one-entry list leaves that subpackage out of the
-    # built distribution. find_packages only picks up directories that contain an
-    # __init__.py; test packages are excluded so they are not installed.
-    packages=find_packages(exclude=["tests", "tests.*"]),
-    version='0.2.1',
-    license='Apache 2.0',
-    author="Ben Hoover",
-    author_email="benjamin.hoover@ibm.com",
-    url="https://github.com/bhoov/spacyface",
-    keywords=["transformer", "pytorch", "spacy", "tokenize", "tokenization", "NLP", "Natural Language Processing",
-              "huggingface", "linguistic"],
-    include_package_data=True,
-    install_requires=requires,
-    classifiers=[
-        'Development Status :: 3 - Alpha',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        ],
-    # Upper bound matches the classifiers above.
-    python_requires='>=3.6, <3.8'
-)
diff --git a/server/spacyface/spacyface/__init__.py b/server/spacyface/spacyface/__init__.py
deleted file mode 100644
index da93663610b1d6a3db969dc13f8a3729ef3904ae..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Public API of the spacyface package."""
-from .aligner import (
-    MakeAligner,
-    BertAligner,
-    GPT2Aligner,
-    RobertaAligner,
-    DistilBertAligner,
-    TransfoXLAligner,
-    XLNetAligner,
-    AlbertAligner,
-    XLMAligner,
-    CTRLAligner,
-    OpenAIGPTAligner,
-    T5Aligner,
-    XLMRobertaAligner,
-    auto_aligner,
-)
-
-from .simple_spacy_token import SimpleSpacyToken
-
-# Names exported by `from spacyface import *`; each name is listed exactly once.
-__all__ = ["MakeAligner", "SimpleSpacyToken", "BertAligner", "GPT2Aligner", "RobertaAligner", "DistilBertAligner",
-           "TransfoXLAligner", "XLNetAligner", "AlbertAligner", "XLMAligner",
-           "CTRLAligner", "OpenAIGPTAligner", "T5Aligner", "XLMRobertaAligner", "auto_aligner"]
diff --git a/server/spacyface/spacyface/aligner.py b/server/spacyface/spacyface/aligner.py
deleted file mode 100644
index ce5d2f651c63a0c3c97fe3c341e2649ca20b15c6..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/aligner.py
+++ /dev/null
@@ -1,261 +0,0 @@
-from typing import List, Iterable, Union
-import spacy
-from spacy.tokens.token import Token as SpacyToken
-from spacy.tokens.doc import Doc as SpacyDoc
-import torch
-import regex as re
-
-from transformers import (
-    AutoTokenizer,
-    BertTokenizer,
-    GPT2Tokenizer,
-    RobertaTokenizer,
-    DistilBertTokenizer,
-    TransfoXLTokenizer,
-    XLNetTokenizer,
-    XLMTokenizer,
-    AlbertTokenizer,
-    CTRLTokenizer,
-    T5Tokenizer,
-    XLMRobertaTokenizer,
-    OpenAIGPTTokenizer,
-    XLMRobertaTokenizer,
-    AutoTokenizer,
-)
-
-from .simple_spacy_token import SimpleSpacyToken
-from .utils.f import flatten_, assoc, delegates, memoize
-
-def doc_to_fixed_tokens(doc: SpacyDoc) -> List[str]:
-    """Return one string per token of `doc`, each chosen by `fix_token`."""
-    return list(map(fix_token, doc))
-
-def fix_token(tok: SpacyToken) -> str:
-    """Pick the string that represents a spaCy token: its text or its norm.
-
-    The text is kept when it equals the norm apart from casing; otherwise the norm
-    is returned, which is how language exceptions get replaced.
-
-    This covers most cases EXCEPT when an exception is part of a word with a '-' in it.
-    For example, "whatve-you-dont" produces two different tokenizations:
-
-    >>> alnr = BertAligner.from_pretrained('bert-base-uncased')
-    >>> s = "whatve-you-dont"
-    >>> alnr.tokenize(s) # => ['what', '##ve', '-', 'you', '-', 'don', '##t']
-    >>> [t.token for t in alnr.meta_tokenize(s)] # => ['what', 'have', '-', 'you', '-', 'do', 'not']
-
-    In practice this occurs rarely enough that it is seldom a problem for real sentences.
-    """
-    if tok.text.lower() == tok.norm_:
-        return tok.text
-    return tok.norm_
-
-def MakeAligner(pretrained_tokenizer, spacy_language_model):
-    """Create an aligner from the pretrained tokenizers. Some caveats to note:
-
-    Usage:
-        BrandNewHuggingfaceAligner = MakeAligner(BrandNewHuggingfaceTokenizer, "en_core_web_sm")
-    """
-    class Aligner(pretrained_tokenizer):
-        @delegates(pretrained_tokenizer.__init__)
-        def __init__(self, **kwargs):
-            """Initialize the parent tokenizer, then attach the spaCy pipeline.
-
-            All keyword arguments are forwarded unchanged to the parent `__init__`.
-            """
-            super().__init__(**kwargs)
-            # Load the spaCy model named by the enclosing factory's `spacy_language_model`.
-            self.spacy_nlp = spacy.load(spacy_language_model)
-            # Class used to wrap tokens together with their metadata.
-            self.meta_container = SimpleSpacyToken
-
-        def prep_sentence(self, s: str) -> str:
-            """Normalize a sentence before it is handed to the tokenizer.
-
-            Runs of whitespace are collapsed to a single space and surrounding whitespace
-            is stripped; the sentence is then rebuilt by joining the normalized spaCy
-            tokens from `_to_normed_spacy` with single spaces.
-            """
-            collapsed = re.sub(r"\s+", r" ", s).strip()
-            normed_tokens = self._to_normed_spacy(collapsed)
-            return " ".join(normed_tokens)
-
-        @delegates(pretrained_tokenizer.tokenize)
-        def tokenize(self, s: str, **kwargs) -> List[str]:
-            """Tokenize `s` with the parent tokenizer after `prep_sentence` normalization.
-
-            Extra keyword arguments are passed through to the parent `tokenize`.
-            """
-            return super().tokenize(self.prep_sentence(s), **kwargs)
-
-        def meta_tokenize(self, s: str) -> List[SimpleSpacyToken]:
-            """Tokenize a sentence and return the spaCy metadata for every resulting token.
-
-            Because of implementation differences this does not offer the same API as the
-            PreTrainedTokenizer's `tokenize` function.
-            """
-            prepared = self.prep_sentence(s)
-            spacy_meta = self._to_spacy_meta(prepared)
-            return self._tokenize_from_spacy_meta(spacy_meta)
-
-        def meta_from_tokens(self, sentence: str, tokens: List[str], perform_check=True) -> List[SimpleSpacyToken]:
-            """Convert existing tokens into their metadata, ignoring effects of special tokens from the tokenizer
-
-            NOTE that the sentence MUST be the same sentence that produced the tokens, otherwise,
-            an unpredictable error may occur. Or worse, it will act like it works when it in fact doesn't.
-
-            Parameters:
-                - sentence: Sentence the tokens came from
-                - tokens: Tokenized version of the sentence. Can be post encoding or pre-encoding
-                    (where special tokens are added)
-                - perform_check: If True, check that the tokens come from the sentence. This slows down processing
-                    and should be False if speed is more important than accuracy
-
-            Returns:
-                One metadata entry per element of `tokens`. Special tokens get a fresh
-                `meta_container` built from the token string; every other token takes the
-                next unused entry of `meta_tokenize(sentence)`.
-
-            Raises:
-                AssertionError: if `perform_check` is True and `tokens` matches neither the
-                    encoded nor the tokenized form of `sentence`.
-            """
-            orig_meta = self.meta_tokenize(sentence)
-
-            # Unfortunately, this can really slow down predictions.
-            if perform_check:
-                # Short-circuit: the sentence is only re-tokenized when the encoded ids
-                # do not already match.
-                matches = (
-                    self.encode(sentence) == self.convert_tokens_to_ids(tokens)
-                    or self.tokenize(sentence) == tokens
-                )
-                assert matches, "Can only take tokens that come from the original sentence!"
-
-            # Look the special tokens up once instead of on every loop iteration.
-            special_tokens = self.all_special_tokens
-
-            new_meta = []
-            j = 0  # index of the next unused entry of `orig_meta`
-            for b in tokens:
-                if b in special_tokens:
-                    new_meta.append(self.meta_container(b))
-                else:
-                    new_meta.append(orig_meta[j])
-                    j += 1
-
-            return new_meta
-
-        def _to_normed_spacy(self, s: str) -> List[str]:
-            """Run spaCy on `s` and return its tokens with language exceptions normalized."""
-            return self._doc_to_fixed_tokens(self.spacy_nlp(s))
-
-        def _to_spacy_meta(self, s: str) -> List[SimpleSpacyToken]:
-            """Run spaCy on `s` and wrap every resulting token in `self.meta_container`."""
-            records = []
-            for spacy_tok in self.spacy_nlp(s):
-                records.append(self.meta_container(spacy_tok))
-            return records
-
-        @delegates(pretrained_tokenizer.tokenize)
-        def _raw_tokenize(self, s: str, **kwargs) -> List[str]:
-            """Tokenize `s` with the parent tokenizer directly.
-
-            Unlike `tokenize`, the sentence is not passed through `prep_sentence` first;
-            keyword arguments are forwarded unchanged.
-            """
-            return super().tokenize(s, **kwargs)
-
-        def _to_raw_spacy(self, s: str) -> List[str]:
-            """Run spaCy on `s` and return the unmodified text of each token."""
-            return [spacy_tok.text for spacy_tok in self.spacy_nlp(s)]
-
-        def _tokenize_from_spacy_meta(self, spacy_meta: List[SimpleSpacyToken]) -> List[SimpleSpacyToken]:
-            """Split each spaCy-level token into tokenizer tokens and flatten the result.
-
-            Every entry is passed to `_tokenize_from_meta_single` together with its
-            position in `spacy_meta`.
-            """
-            per_word = []
-            for position, word_meta in enumerate(spacy_meta):
-                per_word.append(self._tokenize_from_meta_single(word_meta, position))
-            return flatten_(per_word)
-
-        def _tokenize_from_meta_single(self, meta_token: SimpleSpacyToken, idx:int) -> List[SimpleSpacyToken]:
-            """Split a single spacy token with metadata into tokenizer tokens.
-
-            Because the transformer's tokenizer may split each Spacy-tokenized word into multiple subwords,
-            output a list
-
-            For GPT2 tokenization, there is a different behavior for the tokenization of a word if it
-            starts the sentence vs if it occurs later in the sentence.
-            """
-            BUFFER = "X " # GPT tokenization fusses if it thinks the token is the beginning of the sentence
-
-            def choose_norm(t):
-                return t['token'] if t['token'].lower() == t['norm'] else t['norm']
-
-            tok = choose_norm(meta_token)
-
-            if idx != 0:
-                s = BUFFER + tok # Add a buffer with guaranteed tokenization of length 1 to input
-                offset = 1
-            else:
-                s = tok
-                offset = 0
-
-            bpe_tokens = super().tokenize(s) # Can't do `self.tokenize` because it will normalize again
-
-            # Functional version that works with dictionaries
-            return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]
-
-        def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
-            """Extract tokens from a document, accounting for exceptions only if needed"""
-            tokens = doc_to_fixed_tokens(doc)
-            return tokens
-
-        def _maybe_conv_to_token(self, tok_or_str:Union[str, SimpleSpacyToken]):
-            """Convert a token to a SimpleSpacy token if a string. Otherwise, return input unmodified
-
-            Args:
-                tok_or_str: The token be analyzed
-
-            Returns:
-                SimpleSpacyToken. If input was a string, it has been converted to this class.
-            """
-
-            if isinstance(tok_or_str, SimpleSpacyToken):
-                return tok_or_str
-            return SimpleSpacyToken(self.convert_ids_to_tokens([tok_or_str])[0])
-
-        def sentence_to_input(self, sentence:str):
-            """Convert sentence to the input needed for a huggingface model
-
-            Args:
-                sentence: Sentence to prepare to send into the model
-
-            Returns:
-                Tuple of (object that can be directly passed into the model, modified meta tokens)
-
-            Examples:
-
-                >>> alnr = RobertaAligner.from_pretrained('roberta-base')
-                >>> model = AutoModel.from_pretrained('roberta-base', output_attentions=True)
-                >>> model.eval() # Remove DropOut effect
-                >>> model_input, meta_info = alnr.sentence_to_input(sentence)
-                >>> last_layer_hidden_state, pooler, atts = model(**model_input)
-            """
-
-            meta_tokens = self.meta_tokenize(sentence)
-            tokens = [tok.token for tok in meta_tokens]
-            ids = self.convert_tokens_to_ids(tokens)
-            raw_model_input = self.prepare_for_model(ids, add_special_tokens=True)
-            model_input = {k: torch.tensor(v).unsqueeze(0) for k,v in raw_model_input.items() if isinstance(v, List)}
-
-            meta_input = self.prepare_for_model(meta_tokens)['input_ids']
-            new_meta = list(map(self._maybe_conv_to_token, meta_input))
-
-            return model_input, new_meta
-
-        def check_tokenization(self, sentence:str, hard_assert=True):
-            tokens = self.tokenize(sentence)
-            meta_tokens = self.meta_tokenize(sentence)
-            mtokens = [m.token for m in meta_tokens]
-
-            error_str = """Meta tokenization did not match expected tokenization!
-
-            EXPECTED:
-            {}
-
-            META TOKENS REPORTED:
-            {}
-
-            """
-            is_fine = mtokens == tokens
-
-            if hard_assert:
-                assert is_fine, error_str.format(tokens, mtokens)
-            else:
-                if not is_fine: print(error_str.format(tokens, mtokens))
-
-    return Aligner
-
-english = "en_core_web_sm"
-
-BertAligner = MakeAligner(BertTokenizer, english)
-GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
-RobertaAligner = MakeAligner(RobertaTokenizer, english)
-DistilBertAligner = MakeAligner(DistilBertTokenizer, english)
-TransfoXLAligner = MakeAligner(TransfoXLTokenizer, english)
-XLNetAligner = MakeAligner(XLNetTokenizer, english)
-XLMAligner = MakeAligner(XLMTokenizer, english)
-CTRLAligner = MakeAligner(CTRLTokenizer, english)
-AlbertAligner = MakeAligner(AlbertTokenizer, english)
-OpenAIGPTAligner= MakeAligner(OpenAIGPTTokenizer, english)
-T5Aligner= MakeAligner(T5Tokenizer, english)
-XLMRobertaAligner= MakeAligner(XLMRobertaTokenizer, english)
-
-@memoize
-def auto_aligner(pretrained_name_or_path):
-    tok_class = AutoTokenizer.from_pretrained(pretrained_name_or_path).__class__
-    return MakeAligner(tok_class, english).from_pretrained(pretrained_name_or_path)
\ No newline at end of file
diff --git a/server/spacyface/spacyface/checker/__init__.py b/server/spacyface/spacyface/checker/__init__.py
deleted file mode 100644
index 73f88daf6514dd1e623b1c31c7b8f01059db21df..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/checker/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Use to verify an aligner for a particular application"""
-from .against_corpus import check_against_corpus
-
-__all__ = ["check_against_corpus"]
diff --git a/server/spacyface/spacyface/checker/against_corpus.py b/server/spacyface/spacyface/checker/against_corpus.py
deleted file mode 100644
index b774fe02a4dbadb683036fa6f4258521a03a5606..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/checker/against_corpus.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""This module provides a a means to test an aligner against a desired corpus"""
-
-from pathlib import Path
-import argparse
-from spacyface.utils.sentence_extracting import extract_chars
-from spacyface import *
-
-
-def check_against_corpus(alnr, corpus_name, hard_assert=True):
-    """Go through every sentence of the corpus and see if the meta tokenization is different than base transformer tokenization
-
-    Args:
-        alnr: Aligner
-        corpus_name: Name of text file to parse
-        hard_assert: If True, break on first error. Otherwise, print error msg and continue
-    """
-    src = open(corpus_name)
-    chunk_gen = extract_chars(src, 100000)
-    for c, chunk in enumerate(chunk_gen):
-        doc = alnr.spacy_nlp(chunk)
-        sents = [sent.text for sent in doc.sents]
-        for i, sent in enumerate(sents):
-            if i % 100 == 0: print(f"Chunk {c}. Sentence {i}")
-            alnr.check_tokenization(sent, hard_assert)
-
-    src.close()
diff --git a/server/spacyface/spacyface/simple_spacy_token.py b/server/spacyface/spacyface/simple_spacy_token.py
deleted file mode 100644
index a5297ae9e14d3e552d997fd38f9d886c184f4394..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/simple_spacy_token.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""
-Describes the structure of a language token represented by Spacy-extracted metadata
-
-"""
-import h5py
-import numpy as np
-from spacy.tokens.token import Token as SpacyToken
-from typing import Union, List, Tuple
-
-
-def check_ent(tok: SpacyToken):
-    """Check whether token is an entity
-
-    Default Spacy Token does not assume what kind of entity you are looking for, but
-    provides the following denotations:
-
-        0: No entity tag is set
-        1: inside an entity
-        2: outside an entity
-        3: Token begins an entity
-    
-    Args:
-        tok: The Spacy Token
-
-    Returns:
-        Boolean indicating whether or not token is an entity    
-    """
-    OUT_OF_ENT = 2
-    NO_ENT_DEFINED = 0
-    return tok.ent_iob != OUT_OF_ENT and tok.ent_iob != NO_ENT_DEFINED
-
-class SimpleSpacyToken():
-    """A wrapper around a Spacy token to extract desired information
-
-    This class implements a basic functional dictionary-like wrapper around the spacy token to 
-    make it easy to mutate and export attributes without directly changing state. Any attribute
-    that is not prefixed by '_' is considered a key of this class.
-
-    The design allows for the token to have no metadata by simply passing a `str` into
-    the constructor.
-
-    Attributes:
-        token: str
-        pos: str
-        dep: str
-        norm: str
-        tag: str
-        lemma: str
-        head: str
-        is_ent: bool
-
-    Notes:
-        If exporting to an HDF5 file, make sure to define what hdf5 datatype that attribute 
-        represents by changing the corresponding tuple in 'hdf5_token_dtype'
-    """
-
-    # Define how each attribute is stored in an hdf5 file 
-    # Names MUST match attributes of this class
-    hdf5_token_dtype = [
-        ("token", h5py.special_dtype(vlen=str)),
-        ("pos", h5py.special_dtype(vlen=str)),
-        ("dep", h5py.special_dtype(vlen=str)),
-        ("norm", h5py.special_dtype(vlen=str)),
-        ("tag", h5py.special_dtype(vlen=str)),
-        ("lemma", h5py.special_dtype(vlen=str)),
-        ("head", h5py.special_dtype(vlen=str)),
-        ("is_ent", np.bool_),
-    ]
-
-    def __init__(self, t:Union[SpacyToken, str]):
-        """Create a simplified version of a spacy token
-        
-        Args:
-            t: A string or Spacy Token object to wrap
-
-        Raises:
-            ValueError: If input is not of type SpacyToken or str
-        """
-        self._orig_token = t
-
-        if type(t) == SpacyToken:
-            self.token = t.text
-            self.pos = t.pos_
-            self.dep = t.dep_
-            self.norm = t.norm_
-            self.tag = t.tag_
-            self.lemma = t.lemma_
-            self.head = t.head
-            self.is_ent = check_ent(t)
-
-        elif type(t) == str:
-            self.token = t
-            self.pos = None
-            self.dep = None
-            self.norm = None
-            self.tag = None
-            self.lemma = None
-            self.head = None
-            self.is_ent = None
-
-        else:
-            raise ValueError("Expected input of SpacyToken or str")
-
-    def pick(self, keys:List[str]):
-        """Return subset of the attributes specified in 'keys' as a simple dictioniary
-        
-        Args:
-            keys: List of keys to extract
-
-        Returns:
-            Dictionary of only k in keys
-
-        Raises:
-            KeyError: If k in 'keys' is not an attribute
-
-        """
-        return {k: self[k] for k in keys}
-
-    def assoc(self, key:str, value):
-        """Set the 'key' to the 'value', returning a new instance of this class.
-        
-        Args:
-            key: Key that receives the value
-            value: Value to assign to the key
-
-        Returns:
-            A new instance of this class with the modified key:value pair
-        """
-        out = SimpleSpacyToken(self._orig_token)
-        out[key] = value
-        return out
-
-    def __getitem__(self, key):
-        """Access the key from this objects dictionary"""
-        return self.__dict__[key]
-
-    def __setitem__(self, key, value):
-        """Assign, in place, the value to the key"""
-        self.__dict__[key] = value
-
-    def keys(self) -> List[str]:
-        """Return a list of all attributes that don't start with '_'"""
-        return [k for k in self.__dict__.keys() if not k.startswith('_')]
-
-    def values(self) -> List:
-        """Return a list of all values whose keys don't start with '_'"""
-        
-        return [v for _, v in self.__dict__.items() if not k.startswith('_')]
-
-    def items(self) -> List[Tuple]:
-        """Return a list of all items whose keys don't start with '_'"""
-        return [(k, v) for k,v in self.__dict__.items() if not k.startswith('_')]
-
-    def __repr__(self):
-        return f"SimpleSpacyToken: {self.items()}"
\ No newline at end of file
diff --git a/server/spacyface/spacyface/utils/f.py b/server/spacyface/spacyface/utils/f.py
deleted file mode 100644
index be109047c8f2c37a2881339536c5cafa62a03623..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/utils/f.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""General programming utils, inclined toward functional programming.
-
-If ever a function changes its input in place, it is denoted by a trailing `_`
-"""
-
-import inspect
-from itertools import zip_longest
-from typing import List, Set, Union, Dict
-
-
-def ifnone(*xs):
-    """Return the first item in 'x' that is not None"""
-    for x in xs: 
-        if x is not None: return x
-    return None
-
-def custom_dir(c, add): return dir(type(c)) + list(c.__dict__.keys()) + add
-
-class GetAttr:
-    """Base class for attr accesses in `self._xtra` passed down to `self.default`
-    
-    Taken from article by Jeremy Howard: https://www.fast.ai/2019/08/06/delegation/
-
-    Usage:
-
-        ```
-        class ProductPage(GetAttr):
-            def __init__(self, page, price, cost):
-                self.page,self.price,self.cost = page,price,cost
-                self.default = page
-        ```
-    """
-    @property
-    def _xtra(self): return [o for o in dir(self.default) if not o.startswith('_')]
-    def __getattr__(self,k):
-        if k in self._xtra: return getattr(self.default, k)
-        raise AttributeError(k)
-    def __dir__(self): return custom_dir(self, self._xtra)
-
-# TODO: Can `delegates` be applied to several target functions at once?
-# TODO: Can the docstring of the delegated function be appended to the existing docstring? Or at least point to the delegated function?
-def delegates(to=None, keep=False):
-    """ Decorator: replace `**kwargs` in signature with params from `to`.
-    
-    Taken from article by Jeremy Howard: https://www.fast.ai/2019/08/06/delegation/
-    """
-    
-    def _f(f):
-        if to is None: to_f,from_f = f.__base__.__init__,f.__init__
-        else:          to_f,from_f = to,f
-        sig = inspect.signature(from_f)
-        sigd = dict(sig.parameters)
-        k = sigd.pop('kwargs')
-        s2 = {k:v for k,v in inspect.signature(to_f).parameters.items()
-              if v.default != inspect.Parameter.empty and k not in sigd}
-        sigd.update(s2)
-        if keep: sigd['kwargs'] = k
-        from_f.__signature__ = sig.replace(parameters=sigd.values())
-        return f
-    return _f
-
-def pick(keys:Union[List, Set], obj:Dict) -> Dict:
-    """ Return a NEW object containing `keys` from the original `obj` """
-    return {k: obj[k] for k in keys}
-
-def memoize(f):
-    """Memoize a function.
-    
-    Use lookup table when the same inputs are passed to the function instead of running that function again
-    """
-    memo = {}
-    def helper(*x):
-        if x not in memo:            
-            memo[x] = f(*x)
-        return memo[x]
-    return helper
-
-def assoc(k, v, orig):
-    """Given an original dictionary orig, return a cloned dictionary with `k` set to `v`"""
-    out = orig.copy()
-    out[k] = v
-    return out
-
-def make_unique(f):
-    """The input function will only run and return if it hasn't seen its argument before. 
-    
-    Otherwise, it will return `None`.
-    """
-    s = set()
-    def helper(x):
-        if x in s:
-            return None
-        s.add(x)
-        return f(x)
-
-    return helper
-
-def flatten_(items, seqtypes=(list, tuple)):
-    """Flattten an arbitrarily nested list IN PLACE"""
-    for i, x in enumerate(items):
-        while i < len(items) and isinstance(items[i], seqtypes):
-            items[i:i+1] = items[i]
-    return items
\ No newline at end of file
diff --git a/server/spacyface/spacyface/utils/sentence_extracting.py b/server/spacyface/spacyface/utils/sentence_extracting.py
deleted file mode 100644
index 13dd6d66d0ae4d332914415f354dfc27c067fb8d..0000000000000000000000000000000000000000
--- a/server/spacyface/spacyface/utils/sentence_extracting.py
+++ /dev/null
@@ -1,176 +0,0 @@
-"""Extractor functions to retrieve sentences by character chunks from a file
-
-This script contains the logic that allows the user to process and filter
-sentences of the original corpus. By default, this considers a minimum sentence
-length, and removes newlines and multiple consecutive spaces.
-
-Configuration for existing functionality is at the top of the file. Feel free to
-add new processing and/or filter functions. The "process_line" and "filter_line"
-functions contain the pipeline for processing the scripts as needed.
-
-"""
-import regex as re
-import argparse
-from pathlib import Path
-from functools import partial
-from typing import Union
-
-MIN_LINE_LENGTH = 8 # words
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f", "--file", help="Path to .txt file to analyze and annotate")
-    parser.add_argument("-o", "--outdir", help="Path of directory in which to store the analyzed sentences as a .pckl")
-
-
-    args = parser.parse_args()
-    return args
-
-# ============================================================
-#                  Helper functions
-# ============================================================
-# String -> String
-def replace_newlines(s:str) -> str:
-    return re.sub(r"\n+", r" ", s)
-
-# String -> String
-def replace_multispace(s:str) -> str:
-    return re.sub(r"\s+", r" ", s)
-
-def is_short_sentence(s:str, min_len=8) -> str:
-    """Returns True if the sentence has less than `min_len` number of words"""
-    return len(s.split(' ')) < min_len
-
-def contains_char(char:str, s:str) -> str:
-    return char in s
-
-# ============================================================
-#                  Compilation functions
-# ============================================================
-
-def process_line(line:str) -> str:
-    """"Replaces newlines with spaces and removes multiple consecutive spaces from a chunk of file.
-
-    Args:
-        line: Chunk of text
-
-    Returns:
-        Input that has been stripped of newlines and multiple consecutive spaces.
-    """
-    s = replace_multispace(replace_newlines(line))
-    return s
-
-def filter_line(line:str) -> bool:
-    """Returns True if the sentence passes the MIN_LINE_LENGTH configuration
-
-    Redefine this function with desired helper functions, returning true if you want to keep the line
-    """
-    fails = is_short_sentence(line, MIN_LINE_LENGTH)
-
-    return not fails
-
-# ============================================================
-#                      Main Logic
-# ============================================================
-
-def read_outcomes(chars:str) -> Union[str, None]:
-    """From a chunk of characters, decide whether to return the processed characters or Nothing.
-
-    If the input is the empty string "", raise StopIteration
-
-    Args:
-        chars: Chunk of text to process
-
-    Returns:
-        The processed chunk of text or nothing if the characters do not pass the filtering
-
-    Raises:
-        StopIteration: If the input is the empty string "", raise StopIteration
-    """
-
-    if chars == '': raise StopIteration
-    line = process_line(chars)
-    if filter_line(line): return line
-    return None
-
-def get_chars(n:int, f) -> Union[str, None]:
-    """Extract `n` chars from opened file `f`
-
-    Args:
-        n: Number of characters to read from the opened file
-        f: Opened file from the return of `open(fname)`
-
-    Returns:
-        The processed chunk of text or nothing if the characters do not pass the filtering
-
-    Raises:
-        This function does not raise any errors of its own, but can pass up the StopIteration exception
-          from read_outcomes
-    """
-    chars = f.read(n)
-    return read_outcomes(chars)
-
-def get_line(f):
-    """Given an open file, get the next line and process it. Handles 3 scenarios:
-
-    1. StopIteration indicates the opened file has reached the end
-    2. Return a processed line if it passes the filter
-    3. If line does not pass the filter line, return None
-    """
-    line = f.readline()
-    return read_outcomes(line)
-
-def read_on(reader, f):
-    """Read from an open file `f` according to the function `reader`
-
-    Args:
-        reader: A unary function of signature (f: _io.TextIOWrapper) -> str
-        f: An opened file, as returned by `open(fname)`
-
-    Yields:
-        A generator that returns lines defined by `reader` until the end of the file is reached.
-    """
-    while True:
-        try:
-            line = reader(f)
-        except StopIteration:
-            break
-
-        if line is not None:
-            yield line
-
-
-def extract_chars(infile, n=10000):
-    """Extract `n` characters in batches from opened `infile`"""
-    reader = partial(get_chars, n)
-    return read_on(reader, infile)
-
-def extract_lines(infile):
-    """Given a file, yield the processed lines from that file"""
-    with open(infile, 'r') as src:
-        return read_on(get_line, src)
-
-def extract_sentences_to_file(infile, outfname:str):
-    """Extract sentences from a file into a new file indicated by `outfname`."""
-    out = open(outfname, 'x')
-
-    linegen = extract_lines(infile)
-
-    for line in linegen:
-        out.write(line + "\n")
-
-    out.close()
-
-def main(infile, outdir):
-    """Main function for creating the outdir and saving the processed sentences to that file"""
-    outfname = Path(infile).stem + '.txt'
-    outdir = Path(outdir)
-    outdir.mkdir(parents=True, exist_ok=True)
-    outfile = outdir / outfname
-    out_path = extract_sentences_to_file(infile, outfile)
-
-    return out_path
-
-# Script entry point: read the input file and output directory from the command line
-if __name__ == "__main__":
-    args = parse_args()
-    main(args.file, args.outdir)
diff --git a/server/spacyface/tests/EN_TEST_SENTS.py b/server/spacyface/tests/EN_TEST_SENTS.py
deleted file mode 100644
index d0e48b24dedfb952b047a245551b40d5ef7f2fca..0000000000000000000000000000000000000000
--- a/server/spacyface/tests/EN_TEST_SENTS.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""A collection of english test sentences to use when testing the aligners"""
-
-# Sentences mixing unusual casing, contractions, punctuation and symbol-joined words
-SPACY_EN_TEST_SENTS = [
-    'the LIFE',
-    'the LIFEST',
-    'the LIFESTPHSESDF',
-    'the LI FE ST',
-    "I can't understand for the LIFE of me why we Aren't going home",
-    "There is nothing I can say or do... that will <MAKE> me do what YOU want!!",
-    "This ain't going to mess me up, Ain't it?",
-    "It's tonsa fun in the whatve whatve@you@don't U.K.",
-    "It's tonsa fun in the whatve whatve_you_dont U.K.",
-    ]
-
-# Hyphen-joined variants of the sentences above
-# NOTE(review): presumably these are known to break the aligners -- confirm before relying on it
-BROKEN_EN_TEST_SENTS = [
-    "It's tonsa fun in the whatve whatve-you-dont U.K.",
-    "It's tonsa fun in the whatve whatve-you-done U.K.",
-]
diff --git a/server/spacyface/tests/__init__.py b/server/spacyface/tests/__init__.py
deleted file mode 100644
index a55e8c8c9edd0ede70b83dd5e3693c2e4080d027..0000000000000000000000000000000000000000
--- a/server/spacyface/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Temporary init. This is not meant to be a package"""
diff --git a/server/spacyface/tests/test_aligner.py b/server/spacyface/tests/test_aligner.py
deleted file mode 100644
index b5a53866fc5e51bf39479d15177ec09091dc105a..0000000000000000000000000000000000000000
--- a/server/spacyface/tests/test_aligner.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from spacyface import *
-import pytest
-
-def load_sample_en_sents():
-    from .EN_TEST_SENTS import SPACY_EN_TEST_SENTS
-    return SPACY_EN_TEST_SENTS
-
-sentences = load_sample_en_sents()
-
-@pytest.mark.parametrize("model_name,alnr_class",
-                        [('bert-base-uncased', BertAligner),
-                         ('bert-base-cased', BertAligner),
-                         ('gpt2', GPT2Aligner),
-                         ('roberta-base', RobertaAligner),
-                         ('distilbert-base-uncased', DistilBertAligner),
-                         ('transfo-xl-wt103', TransfoXLAligner),
-                         ('xlnet-base-cased', XLNetAligner),
-                         ('xlm-mlm-en-2048', XLMAligner),
-                         ('ctrl', CTRLAligner),
-                         ('albert-base-v1', AlbertAligner),
-                         ('openai-gpt', OpenAIGPTAligner),
-                         ('xlm-roberta-base', XLMRobertaAligner),
-                         # ('t5-small', T5Aligner), # This does not currently work
-                        ])
-def test_aligner(model_name, alnr_class):
-    """NOTE: Will be obsolete when the aligner is able to work with transformer auto model"""
-    a = alnr_class.from_pretrained(model_name)
-
-    for s in sentences:
-        mtokens = [m['token'] for m in a.meta_tokenize(s)]
-        tokens = a.tokenize(s)
-        assert tokens == mtokens, f"{tokens} \n {mtokens}"
diff --git a/server/spacyface/tests/wiki.test.txt b/server/spacyface/tests/wiki.test.txt
deleted file mode 100644
index d9d791581935032220e10e28a04cfbd6460b72f5..0000000000000000000000000000000000000000
--- a/server/spacyface/tests/wiki.test.txt
+++ /dev/null
@@ -1,4358 +0,0 @@
- 
- = Robert Boulter = 
- 
- Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
- In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on a 2006 episode of the television series , Doctors , followed by a role in the 2007 theatre production of How to Curse directed by Josie Rourke . How to Curse was performed at Bush Theatre in the London Borough of Hammersmith and Fulham . Boulter starred in two films in 2008 , Daylight Robbery by filmmaker Paris Leonti , and Donkey Punch directed by Olly Blackburn . In May 2008 , Boulter made a guest appearance on a two @-@ part episode arc of the television series Waking the Dead , followed by an appearance on the television series Survivors in November 2008 . He had a recurring role in ten episodes of the television series Casualty in 2010 , as " Kieron Fletcher " . Boulter starred in the 2011 film Mercenaries directed by Paris Leonti . 
- 
- = = Career = = 
- 
- 
- = = = 2000 – 2005 = = = 
- 
- In 2000 Boulter had a guest @-@ starring role on the television series The Bill ; he portrayed " Scott Parry " in the episode , " In Safe Hands " . Boulter starred as " Scott " in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . A review of Boulter 's performance in The Independent on Sunday described him as " horribly menacing " in the role , and he received critical reviews in The Herald , and Evening Standard . He appeared in the television series Judge John Deed in 2002 as " Addem Armitage " in the episode " Political Expediency " , and had a role as a different character " Toby Steele " on The Bill . 
- He had a recurring role in 2003 on two episodes of The Bill , as character " Connor Price " . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . Boulter starred as " Darren " , in the 2005 theatre productions of the Philip Ridley play Mercury Fur . It was performed at the Drum Theatre in Plymouth , and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . Boulter received a favorable review in The Daily Telegraph : " The acting is shatteringly intense , with wired performances from Ben Whishaw ( now unrecognisable from his performance as Trevor Nunn 's Hamlet ) , Robert Boulter , Shane Zaza and Fraser Ayres . " The Guardian noted , " Ben Whishaw and Robert Boulter offer tenderness amid the savagery . " 
- 
- = = = 2006 – present = = = 
- 
- In 2006 Boulter starred in the play Citizenship written by Mark Ravenhill . The play was part of a series which featured different playwrights , titled Burn / Chatroom / Citizenship . In a 2006 interview , fellow actor Ben Whishaw identified Boulter as one of his favorite co @-@ stars : " I loved working with a guy called Robert Boulter , who was in the triple bill of Burn , Chatroom and Citizenship at the National . He played my brother in Mercury Fur . " He portrayed " Jason Tyler " on the 2006 episode of the television series , Doctors , titled " Something I Ate " . Boulter starred as " William " in the 2007 production of How to Curse directed by Josie Rourke . How to Curse was performed at Bush Theatre in the London Borough of Hammersmith and Fulham . In a review of the production for The Daily Telegraph , theatre critic Charles Spencer noted , " Robert Boulter brings a touching vulnerability to the stage as William . " 
- Boulter starred in two films in 2008 , Daylight Robbery by filmmaker Paris Leonti , and Donkey Punch directed by Olly Blackburn . Boulter portrayed a character named " Sean " in Donkey Punch , who tags along with character " Josh " as the " quiet brother ... who hits it off with Tammi " . Boulter guest starred on a two @-@ part episode arc " Wounds " in May 2008 of the television series Waking the Dead as character " Jimmy Dearden " . He appeared on the television series Survivors as " Neil " in November 2008 . He had a recurring role in ten episodes of the television series Casualty in 2010 , as " Kieron Fletcher " . He portrayed an emergency physician applying for a medical fellowship . He commented on the inherent difficulties in portraying a physician on television : " Playing a doctor is a strange experience . Pretending you know what you 're talking about when you don 't is very bizarre but there are advisers on set who are fantastic at taking you through procedures and giving you the confidence to stand there and look like you know what you 're doing . " Boulter starred in the 2011 film Mercenaries directed by Paris Leonti . 
- 
- = = Filmography = = 
- 
- 
- = = = Film = = = 
- 
- 
- = = = Television = = = 
- 
- 
- = = = Theatre = = = 
- 
- 
- 
- = Du Fu = 
- 
- Du Fu ( Wade – Giles : Tu Fu ; Chinese : 杜甫 ; 712 – 770 ) was a prominent Chinese poet of the Tang dynasty . Along with Li Bai ( Li Po ) , he is frequently called the greatest of the Chinese poets . His greatest ambition was to serve his country as a successful civil servant , but he proved unable to make the necessary accommodations . His life , like the whole country , was devastated by the An Lushan Rebellion of 755 , and his last 15 years were a time of almost constant unrest . 
- Although initially he was little @-@ known to other writers , his works came to be hugely influential in both Chinese and Japanese literary culture . Of his poetic writing , nearly fifteen hundred poems have been preserved over the ages . He has been called the " Poet @-@ Historian " and the " Poet @-@ Sage " by Chinese critics , while the range of his work has allowed him to be introduced to Western readers as " the Chinese Virgil , Horace , Ovid , Shakespeare , Milton , Burns , Wordsworth , Béranger , Hugo or Baudelaire " . 
- 
- = = Life = = 
- 
- Traditional Chinese literary criticism emphasized the life of the author when interpreting a work , a practice which Burton Watson attributes to " the close links that traditional Chinese thought posits between art and morality " . Since many of Du Fu 's poems feature morality and history , this practice is particularly important . Another reason , identified by the Chinese historian William Hung , is that Chinese poems are typically concise , omitting context that might be relevant , but which an informed contemporary could be assumed to know . For modern Western readers , " The less accurately we know the time , the place and the circumstances in the background , the more liable we are to imagine it incorrectly , and the result will be that we either misunderstand the poem or fail to understand it altogether " . Stephen Owen suggests a third factor particular to Du Fu , arguing that the variety of the poet 's work required consideration of his whole life , rather than the " reductive " categorizations used for more limited poets . 
- 
- = = = Early years = = = 
- 
- Most of what is known of Du Fu 's life comes from his poems . His paternal grandfather was Du Shenyan , a noted politician and poet during the reign of Empress Wu . Du Fu was born in 712 ; the exact birthplace is unknown , except that it was near Luoyang , Henan province ( Gong county is a favourite candidate ) . In later life , he considered himself to belong to the capital city of Chang 'an , ancestral hometown of the Du family . 
- Du Fu 's mother died shortly after he was born , and he was partially raised by his aunt . He had an elder brother , who died young . He also had three half brothers and one half sister , to whom he frequently refers in his poems , although he never mentions his stepmother . 
- The son of a minor scholar @-@ official , his youth was spent on the standard education of a future civil servant : study and memorisation of the Confucian classics of philosophy , history and poetry . He later claimed to have produced creditable poems by his early teens , but these have been lost . 
- In the early 730s , he travelled in the Jiangsu / Zhejiang area ; his earliest surviving poem , describing a poetry contest , is thought to date from the end of this period , around 735 . In that year , he took the civil service exam , likely in Chang 'an . He failed , to his surprise and that of centuries of later critics . Hung concludes that he probably failed because his prose style at the time was too dense and obscure , while Chou suggests his failure to cultivate connections in the capital may have been to blame . After this failure , he went back to traveling , this time around Shandong and Hebei . 
- His father died around 740 . Du Fu would have been allowed to enter the civil service because of his father 's rank , but he is thought to have given up the privilege in favour of one of his half brothers . He spent the next four years living in the Luoyang area , fulfilling his duties in domestic affairs . 
- In the autumn of 744 , he met Li Bai ( Li Po ) for the first time , and the two poets formed a friendship . David Young describes this as " the most significant formative element in Du Fu 's artistic development " because it gave him a living example of the reclusive poet @-@ scholar life to which he was attracted after his failure in the civil service exam . The relationship was somewhat one @-@ sided , however . Du Fu was by some years the younger , while Li Bai was already a poetic star . We have twelve poems to or about Li Bai from the younger poet , but only one in the other direction . They met again only once , in 745 . 
- In 746 , he moved to the capital in an attempt to resurrect his official career . He took the civil service exam a second time during the following year , but all the candidates were failed by the prime minister ( apparently in order to prevent the emergence of possible rivals ) . He never again attempted the examinations , instead petitioning the emperor directly in 751 , 754 and probably again in 755 . He married around 752 , and by 757 the couple had had five children — three sons and two daughters — but one of the sons died in infancy in 755 . From 754 he began to have lung problems ( probably asthma ) , the first of a series of ailments which dogged him for the rest of his life . It was in that year that Du Fu was forced to move his family due to the turmoil of a famine brought about by massive floods in the region . 
- In 755 , he received an appointment as Registrar of the Right Commandant 's office of the Crown Prince 's Palace . Although this was a minor post , in normal times it would have been at least the start of an official career . Even before he had begun work , however , the position was swept away by events . 
- 
- = = = War = = = 
- 
- The An Lushan Rebellion began in December 755 , and was not completely suppressed for almost eight years . It caused enormous disruption to Chinese society : the census of 754 recorded 52 @.@ 9 million people , but ten years later , the census counted just 16 @.@ 9 million , the remainder having been displaced or killed . During this time , Du Fu led a largely itinerant life unsettled by wars , associated famines and imperial displeasure . This period of unhappiness was the making of Du Fu as a poet : Even Shan Chou has written that , " What he saw around him — the lives of his family , neighbors , and strangers – what he heard , and what he hoped for or feared from the progress of various campaigns — these became the enduring themes of his poetry " . Even when he learned of the death of his youngest child , he turned to the suffering of others in his poetry instead of dwelling upon his own misfortunes . Du Fu wrote : 
- Brooding on what I have lived through , if even I know such suffering , the common man must surely be rattled by the winds . 
- In 756 , Emperor Xuanzong was forced to flee the capital and abdicate . Du Fu , who had been away from the city , took his family to a place of safety and attempted to join the court of the new emperor ( Suzong ) , but he was captured by the rebels and taken to Chang 'an . In the autumn , his youngest son , Du Zongwu ( Baby Bear ) , was born . Around this time Du Fu is thought to have contracted malaria . 
- He escaped from Chang 'an the following year , and was appointed Reminder when he rejoined the court in May 757 . This post gave access to the emperor but was largely ceremonial . Du Fu 's conscientiousness compelled him to try to make use of it : he caused trouble for himself by protesting the removal of his friend and patron Fang Guan on a petty charge . He was arrested but was pardoned in June . He was granted leave to visit his family in September , but he soon rejoined the court and on December 8 , 757 , he returned to Chang 'an with the emperor following its recapture by government forces . However , his advice continued to be unappreciated , and in the summer of 758 he was demoted to a post as Commissioner of Education in Huazhou . The position was not to his taste : in one poem , he wrote : 
- I am about to scream madly in the office / Especially when they bring more papers to pile higher on my desk . 
- He moved on in the summer of 759 ; this has traditionally been ascribed to famine , but Hung believes that frustration is a more likely reason . He next spent around six weeks in Qinzhou ( now Tianshui , Gansu province ) , where he wrote more than sixty poems . 
- 
- = = = Chengdu = = = 
- 
- In December 759 , he briefly stayed in Tonggu ( modern Gansu ) . He departed on December 24 for Chengdu ( Sichuan province ) , where he was hosted by local Prefect and fellow poet Pei Di . Du subsequently based himself in Sichuan for most of the next five years . By the autumn of that year he was in financial trouble , and sent poems begging help to various acquaintances . He was relieved by Yan Wu , a friend and former colleague who was appointed governor general at Chengdu . Despite his financial problems , this was one of the happiest and most peaceful periods of his life . Many of Du 's poems from this period are peaceful depictions of his life at " thatched hut " . In 762 , he left the city to escape a rebellion , but he returned in summer 764 when he was appointed an advisor to Yan , who was involved in campaigns against the Tibetan Empire . 
- 
- = = = Last years = = = 
- 
- Luoyang , the region of his birthplace , was recovered by government forces in the winter of 762 , and in the spring of 765 Du Fu and his family sailed down the Yangtze , apparently with the intention of making their way there . They traveled slowly , held up by his ill @-@ health ( by this time he was suffering from poor eyesight , deafness and general old age in addition to his previous ailments ) . They stayed in Kuizhou ( in what is now Baidicheng , Chongqing ) at the entrance to the Three Gorges for almost two years from late spring 766 . This period was Du Fu 's last great poetic flowering , and here he wrote 400 poems in his dense , late style . In autumn 766 , Bo Maolin became governor of the region : he supported Du Fu financially and employed him as his unofficial secretary . 
- In March 768 , he began his journey again and got as far as Hunan province , where he died in Tanzhou ( now Changsha ) in November or December 770 , in his 58th year . He was survived by his wife and two sons , who remained in the area for some years at least . His last known descendant is a grandson who requested a grave inscription for the poet from Yuan Zhen in 813 . 
- Hung summarises his life by concluding that , " He appeared to be a filial son , an affectionate father , a generous brother , a faithful husband , a loyal friend , a dutiful official , and a patriotic subject . " 
- Below is an example of one of Du Fu 's later works , To My Retired Friend Wei ( Chinese : 贈衛八處士 ) . Like many other poems in the Tang it featured the theme of a long parting between friends , which was often due to officials being frequently transferred to the provinces : 
- 
- = = Works = = 
- 
- Criticism of Du Fu 's works has focused on his strong sense of history , his moral engagement , and his technical excellence . 
- 
- = = = History = = = 
- 
- Since the Song dynasty , critics have called Du Fu the " poet historian " ( 詩史 shī shǐ ) . The most directly historical of his poems are those commenting on military tactics or the successes and failures of the government , or the poems of advice which he wrote to the emperor . Indirectly , he wrote about the effect of the times in which he lived on himself , and on the ordinary people of China . As Watson notes , this is information " of a kind seldom found in the officially compiled histories of the era " . 
- Du Fu 's political comments are based on emotion rather than calculation : his prescriptions have been paraphrased as , " Let us all be less selfish , let us all do what we are supposed to do " . Since his views were impossible to disagree with , his forcefully expressed truisms enabled his installation as the central figure of Chinese poetic history . 
- 
- = = = Moral engagement = = = 
- 
- A second favourite epithet of Chinese critics is that of " poet sage " ( 詩聖 shī shèng ) , a counterpart to the philosophical sage , Confucius . One of the earliest surviving works , The Song of the Wagons ( from around 750 ) , gives voice to the sufferings of a conscript soldier in the imperial army and a clear @-@ sighted consciousness of suffering . These concerns are continuously articulated in poems on the lives of both soldiers and civilians produced by Du Fu throughout his life . 
- Although Du Fu 's frequent references to his own difficulties can give the impression of an all @-@ consuming solipsism , Hawkes argues that his " famous compassion in fact includes himself , viewed quite objectively and almost as an afterthought " . He therefore " lends grandeur " to the wider picture by comparing it to " his own slightly comical triviality " . 
- Du Fu 's compassion , for himself and for others , was part of his general broadening of the scope of poetry : he devoted many works to topics which had previously been considered unsuitable for poetic treatment . Zhang Jie wrote that for Du Fu , " everything in this world is poetry " , Du wrote extensively on subjects such as domestic life , calligraphy , paintings , animals , and other poems . 
- 
- = = = Technical excellence = = = 
- 
- Du Fu 's work is notable above all for its range . Chinese critics traditionally used the term 集大成 ( jídàchéng- " complete symphony " ) , a reference to Mencius ' description of Confucius . Yuan Zhen was the first to note the breadth of Du Fu 's achievement , writing in 813 that his predecessor , " united in his work traits which previous men had displayed only singly " . He mastered all the forms of Chinese poetry : Chou says that in every form he " either made outstanding advances or contributed outstanding examples " . Furthermore , his poems use a wide range of registers , from the direct and colloquial to the allusive and self @-@ consciously literary . This variety is manifested even within individual works : Owen identifies the , " rapid stylistic and thematic shifts " in poems which enable the poet to represent different facets of a situation , while Chou uses the term " juxtaposition " as the major analytical tool in her work . Du Fu is noted for having written more on poetics and painting than any other writer of his time . He wrote eighteen poems on painting alone , more than any other Tang poet . Du Fu 's seemingly negative commentary on the prized horse paintings of Han Gan ignited a controversy that has persisted to the present day . 
- The tenor of his work changed as he developed his style and adapted to his surroundings ( " chameleon @-@ like " according to Watson ) : his earliest works are in a relatively derivative , courtly style , but he came into his own in the years of the rebellion . Owen comments on the " grim simplicity " of the Qinzhou poems , which mirrors the desert landscape ; the works from his Chengdu period are " light , often finely observed " ; while the poems from the late Kuizhou period have a " density and power of vision " . 
- Although he wrote in all poetic forms , Du Fu is best known for his lǜshi , a type of poem with strict constraints on form and content , for example : 
- About two thirds of Du Fu 's 1500 extant works are in this form , and he is generally considered to be its leading exponent . His best lǜshi use the parallelisms required by the form to add expressive content rather than as mere technical restrictions . Hawkes comments that , " it is amazing that Tu Fu is able to use so immensely stylized a form in so natural a manner " . 
- 
- = = Influence = = 
- 
- According to the Encyclopædia Britannica , Du Fu 's writings are considered by many literary critics to be among the greatest of all time , and it states " his dense , compressed language makes use of all the connotative overtones of a phrase and of all the intonational potentials of the individual word , qualities that no translation can ever reveal . " 
- In his lifetime and immediately following his death , Du Fu was not greatly appreciated . In part this can be attributed to his stylistic and formal innovations , some of which are still " considered extremely daring and bizarre by Chinese critics . " There are few contemporary references to him — only eleven poems from six writers — and these describe him in terms of affection , but not as a paragon of poetic or moral ideals . Du Fu is also poorly represented in contemporary anthologies of poetry . 
- However , as Hung notes , he " is the only Chinese poet whose influence grew with time " , and his works began to increase in popularity in the ninth century . Early positive comments came from Bai Juyi , who praised the moral sentiments of some of Du Fu 's works ( although he found these in only a small fraction of the poems ) , and from Han Yu , who wrote a piece defending Du Fu and Li Bai on aesthetic grounds from attacks made against them . Both these writers showed the influence of Du Fu in their own poetic work . By the beginning of the 10th century , Wei Zhuang constructed the first replica of his thatched cottage in Sichuan . 
- It was in the 11th century , during the Northern Song era that Du Fu 's reputation reached its peak . In this period a comprehensive re @-@ evaluation of earlier poets took place , in which Wang Wei , Li Bai and Du Fu came to be regarded as representing respectively the Buddhist , Daoist and Confucian strands of Chinese culture . At the same time , the development of Neo @-@ Confucianism ensured that Du Fu , as its poetic exemplar , occupied the paramount position . Su Shi famously expressed this reasoning when he wrote that Du Fu was " preeminent ... because ... through all his vicissitudes , he never for the space of a meal forgot his sovereign " . His influence was helped by his ability to reconcile apparent opposites : political conservatives were attracted by his loyalty to the established order , while political radicals embraced his concern for the poor . Literary conservatives could look to his technical mastery , while literary radicals were inspired by his innovations . Since the establishment of the People 's Republic of China , Du Fu 's loyalty to the state and concern for the poor have been interpreted as embryonic nationalism and socialism , and he has been praised for his use of simple , " people 's language " . 
- Du Fu 's popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi 's concern for the poor , Lu You 's patriotism , and Mei Yaochen 's reflections on the quotidian are a few examples . More broadly , Du Fu 's work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . 
- In the 20th century , he was the favourite poet of Kenneth Rexroth , who has described him as " the greatest non @-@ epic , non @-@ dramatic poet who has survived in any language " , and commented that , " he has made me a better man , as a moral agent and as a perceiving organism " . 
- 
- = = = Influence on Japanese literature = = = 
- 
- Du Fu 's poetry has made a profound impact on Japanese literature , especially on the literature from the Muromachi period and on scholars and poets in the Edo period , including Matsuo Bashō , the very greatest of all haiku poets . Even in modern Japanese , the term Saint of Poetry ( 詩聖 , shisei ) is mostly synonymous with Du Fu . 
- Until the 13th century , the Japanese preferred Bai Juyi above all poets and there were few references to Du Fu , although his influence can be seen in some kanshi ( " Chinese poetry made by Japanese poets " ) anthologies such as Bunka Shūreishū in the 9th century . The first notable Japanese appreciator of Du Fu 's poetry was Kokan Shiren ( 1278 – 1346 ) , a Rinzai Zen patriarch and one of the most prominent authors of the literature of the Five Mountains ; he highly praised Du Fu and made a commentary on some poems of Du Fu from the perspective of a Zen priest in Vol . 11 of Saihokushū . His student Chūgan Engetsu composed many kanshi which were clearly stated " influenced by Du Fu " in their prefaces . Chūgan 's student Gidō Shūshin had close connection with the Court and Ashikaga Shogunate and propagated Du Fu 's poetry in the mundane world ; one day Nijō Yoshimoto , the Kampaku regent of the Court and the highest authority of renga poetry , asked Gidō , " Should I learn the poetry of Du Fu and Li Bai ? " Gidō dared to reply , " Yes if you do have enough capability . No if do not . " Since then , there had been many seminars on Du Fu 's poetry both in Zen temples and in the aristocratic society , and as a result his poetry was often cited in Japanese literature in the Muromachi period , e.g. , Taiheiki , a historical epic in the late 14th century , and some noh plays such as Hyakuman , Bashō , and Shunkan . 
- During the Kan 'ei era of the Edo period ( 1624 – 1643 ) , Shào Chuán ( 邵傳 ) of the Ming Dynasty 's Collective Commentary on Du Fu 's Lǜshi ( 杜律集解 , Toritsu Shikkai ) was imported into Japan , and it gained explosive popularity in Confucian scholars and chōnin ( townspeople ) class . The commentary established Du Fu 's fame as the highest of all poets ; for instance , Hayashi Shunsai , a notable Confucian scholar , commented in Vol . 37 of Gahō Bunshū that Zǐměi [ Du Fu ] was the very best poet in history and praised Shào Chuán 's commentary for its simplicity and readability , while he criticized old commentaries during the Yuan Dynasty were too unfathomable . Matsuo Bashō , the greatest haiku poet , was also strongly influenced by Du Fu ; in Oku no Hosomichi , his masterpiece , he cites the first two lines of A Spring View ( 春望 ) before a haiku as its introduction and also many of his other haiku have similar wording and themes . It is said that when he died in Osaka during a long travel , a copy of Du Fu 's poetry was found with him as one of a few precious items which he was able to carry around . 
- 
- = = Translation = = 
- 
- A variety of styles have been used in efforts to translate Du Fu 's work into English . As Burton Watson remarks in The Selected Poems of Du Fu , " There are many different ways to approach the problems involved in translating Du Fu , which is why we need as many different translations as possible " ( p. xxii ) . The translators have had to contend with bringing out the formal constraints of the original without sounding laboured to a Western ear ( particularly when translating regulated verse , or lǜshi ) , and accommodating the complex allusions contained particularly in the later works ( Hawkes writes that " his poems do not as a rule come through very well in translation " — p. ix ) . One extreme on each issue is represented by Kenneth Rexroth 's One Hundred Poems From the Chinese . His are free translations , which seek to conceal the parallelisms through enjambement and expansion and contraction of the content ; his responses to the allusions are firstly to omit most of these poems from his selection , and secondly to " translate out " the references in those works which he does select . 
- Other translators have placed much greater weight on trying to convey a sense of the poetic forms used by Du Fu . Vikram Seth in Three Chinese Poets uses English @-@ style rhyme schemes , whereas Keith Holyoak in Facing the Moon approximates the Chinese rhyme scheme ; both use end @-@ stopped lines and preserve some degree of parallelism . In The Selected Poems of Du Fu , Burton Watson follows the parallelisms quite strictly , persuading the western reader to adapt to the poems rather than vice versa . Similarly , he deals with the allusion of the later works by combining literal translation with extensive annotation . 
- In 2015 , Stephen Owen published translations , with facing Chinese texts , of the complete poetry of Du Fu in six volumes , with extensive scholarly apparatus , which emphasized literalness . 
- 
- 
- = Kiss You ( One Direction song ) = 
- 
- " Kiss You " is a song recorded by English @-@ Irish boy band One Direction for their second studio album , Take Me Home ( 2012 ) . It was released as the record 's second single in Germany and the third overall single on 7 January 2013 . The song was composed by Kristoffer Fogelmark , Kristian Lundin , Albin Nedler , Savan Kotecha , Shellback and its producers , Carl Falk and Rami Yacoub . " Kiss You " is an upbeat power pop song with electronic effects ; the lyrics detail a protagonist 's infatuation with a significant other . Critics praised the song for its production , calling it a stand @-@ out track on Take Me Home . 
- The track became the group 's sixth top @-@ ten hit in Ireland and the United Kingdom , while attaining top @-@ forty positions in both Belgian territories ( Flanders and Wallonia ) , as well as in Australia , Canada , Denmark , France , New Zealand , and the Netherlands . The single peaked at number 46 on the US Billboard Hot 100 and has been certified gold by the Recording Industry Association of America ( RIAA ) for shipments of 500 @,@ 000 copies . One Direction performed " Kiss You " on both the UK and US versions of The X Factor and 3 major concert tours : Take Me Home Tour ( 2013 ) , Where We Are Tour ( 2014 ) and On the Road Again Tour ( 2015 ) . 
- An accompanying music video , designed to display the group 's comedic timing , was directed by Vaughan Arnell , who had previously worked with the group on two other music videos . The clip depicts the band shooting various scenes via a green screen , which include sequences reminiscent of iconic music videos of songs such as the Beach Boys ' " Surfer Girl " , Elvis Presley 's " Jailhouse Rock " and Rammstein 's " Mein Land " . The music video received 10 @.@ 4 million views in a 24 @-@ hour period and positive commentary from reviewers , who appreciated its carefree , jubilant nature . 
- The song was included in the dancing game Just Dance 2014 , and is also one of the select songs available on the demo version . Additionally , it is the final main track on the US edition of Now That 's What I Call Music ! 46 . 
- 
- = = Background and release = = 
- 
- " Kiss You " was written by Kristoffer Fogelmark , Kristian Lundin , Albin Nedler , Savan Kotecha , Shellback , and its producers , Carl Falk and Rami Yacoub . Falk , Kotecha , and Yacoub had collaboratively composed One Direction 's previous hit singles , " What Makes You Beautiful " , " One Thing " , and " Live While We 're Young " . In April 2012 , The Independent reported that Simon Cowell , the group 's manager , had challenged prominent songwriters to compete for space on One Direction 's second album . Falk said , " It 's important to get their personalities on the music . " In addition , the article reported that Syco Records was working on candidates that included Max Martin and Lundin . 
- " Kiss You " was chosen as the second US single and third international single from their second studio album , Take Me Home . Liam Payne , a group member , in a November 2012 interview with MTV News , explained why they chose " Kiss You " as the album 's second single in the US . Payne was quoted as saying : " With the album , that 's the first one that we listened to and we were like , ' Yeah , we love this song ' " . According to an MTV News article , the number was released digitally in the United States on 17 November 2012 . By 18 January 2013 , the song had not been officially promoted to US radio stations . The track , however , was released by Sony Music Entertainment on 8 February 2013 , as the record 's second single in Germany . 
- 
- = = Composition and reception = = 
- 
- " Kiss You " is an uptempo , upbeat power pop song which runs for a duration of 3 : 04 ( 3 minutes , four seconds ) . The track features electronic effects , colossal hooks , a " na na na " breakdown , and a Motown @-@ tinged melody . One Direction 's vocal range in the song spans from the note of E4 to C ♯ 6 . Instrumentation includes guitar strings , piano lines and vocals . Written in the key of E major , the beat is set in common time and moves at a quick 90 beats per minute , according to the digital sheet music published at Musicnotes.com by Sony / ATV Music Publishing . Likewise , Matt Collar from Allmusic noted that the track is " frenetically hyper " . The lyrical content regards the protagonist 's infatuation with a significant other , and incorporates euphemisms for sexual intercourse in the lines " If you don ’ t wanna take it slow / And you just wanna take me home / Baby say yeah , yeah , yeah , yeah , yeah . " 
- " Kiss You " was well received by contemporary music critics , who centred on its quality of production . Both Rolling Stone 's Jon Dolan , who praised its effectiveness , and Chris Payne of Billboard , who appreciated the melody , described " Kiss You " as one of the album 's highlights . Alexis Petridis for The Guardian commended the track 's chorus as " hard to dislodge from your brain " . Robert Copsey of Digital Spy noted the song 's possibility to become an international hit , applauding it sonically . A reviewer for MTV News described the track 's lyricism as " butterflies @-@ inducing " , and Sam Lansky of Idolator wrote that " Kiss You " is noticeably a stand @-@ out track on its parent album . Melinda Newman , writing for HitFix , regarded the song as " a bouncy , electronic infectious ditty , " while Chris Younie , a critic from 4Music , deemed it an " amazing pop song " , lauding the group 's falsetto and its " head @-@ banging anthemic " chorus . 
- 
- = = Commercial performance = = 
- 
- The single made its Irish Singles Chart debut at number 24 on the week ending 13 December 2012 . It peaked at number seven on the week ending 17 January 2013 , marking their sixth top ten appearance in Ireland . " Kiss You " entered at number 152 in the UK Singles Chart on 24 November 2012 . It peaked at number nine on the UK Singles Chart on 26 January 2013 , becoming One Direction 's sixth top ten hit in the United Kingdom . On the week ending 18 November 2012 , " Kiss You " debuted at number 90 on the United States Billboard Hot 100 due to digital download sales from its parent album . As a result of an " end @-@ of @-@ year download rush " on the week ending 30 December 2012 , the track re @-@ entered the Hot 100 at number 83 . After the accompanying music video was released , the song re @-@ entered the Hot 100 at number 65 . " Kiss You " had sold 207 @,@ 000 digital downloads in the US by 18 January 2013 . The single ultimately peaked at number 46 on the Hot 100 and was certified gold by the Recording Industry Association of America ( RIAA ) on 25 April 2013 , denoting shipments of 500 @,@ 000 copies . 
- The song became One Direction 's fourth top @-@ forty hit on the Canadian Hot 100 , peaking at number 30 . The single bowed at number 13 on the Australian Singles Chart on 27 January 2013 , marking its peak position and the group 's fourth top twenty hit in Australia . The song has been certified platinum by the Australian Recording Industry Association ( ARIA ) for shipments of 70 @,@ 000 copies . The track entered the New Zealand Singles Chart at number 17 on 11 January 2013 . It peaked at number 13 in its third and fourth charting weeks , becoming the group 's sixth top @-@ forty appearance in New Zealand . " Kiss You " has received a gold certification from the Recording Industry Association of New Zealand ( RIANZ ) , indicating sales of 7 @,@ 500 copies . The track also reached the top 40 in both Belgian territories ( Flanders and Wallonia ) , as well as in the Czech Republic , Denmark , France , the Netherlands , and South Korea . In addition , " Kiss You " received gold certifications from the IFPI Norway and Denmark associations , signifying collective shipments of 20 @,@ 000 units . 
- 
- = = Music video = = 
- 
- The accompanying music video , directed by Vaughan Arnell , who had previously directed One Direction 's music videos for " Live While We 're Young " and " Little Things " , was designed to showcase the group 's comedic timing . Inspired by the Beach Boys , cult surfing films , old Hollywood , and British cinema , the music video incorporates " a technicolor vibe and a British kind of romp " , as noted by Arnell in an MTV News interview . 
- Shot by November 2012 , the music video was characterised , in several MTV News interviews , as " bigger than anything we 've done before " by Zayn Malik , as " a lot of hard work " by Payne , as " pure stupidity " by Louis Tomlinson , and as " I wouldn 't say [ it 's ] comedy , it 's all tongue @-@ in @-@ cheek " by Arnell . Premiering worldwide on Vevo on 7 January 2013 , the music video depicts the band shooting different scenes via a green screen , dressed as sailors , surfers , skiers and jailers . The video features scenes reminiscent of the films South Pacific , To Catch a Thief , Jailhouse Rock and Beach Blanket Bingo , as well as the iconic music videos of songs such as The Beach Boys ' " Surfer Girl " , Elvis Presley 's " Blue Hawaii " and Rammstein 's " Mein Land " , among others . 
- The music video garnered 10 @.@ 4 million views in a 24 @-@ hour period , failing to attain the Vevo record held by Justin Bieber 's " Beauty and a Beat " music video ( 10 @.@ 6 million ) . Despite a 34 % gain in weekly activity to their Vevo channel , with the clip 's success and preceding teaser videos earning 38 million views during the week , One Direction held at number two on Billboard 's Social 50 chart . A 15 % rise in Facebook reaction gave way to a 154 @,@ 000 increase in Facebook likes during the week . An additional 191 @,@ 000 Twitter followers contributed to their overall fan base increase as well . 
- Melinda Newman , a contributor for HitFix , favoured the clip as having " everything a video by a boy band should be " and found the group 's careless tone delightful . Rebecca Macatee of E ! Online praised its " intentionally cheesy and utterly adorable " sequences , and MTV News 's Jocelyn Vena described the clip as " conquering old Hollywood " . Molly Chance , writing for Zap2it , was convinced that upon watching the " adorable " music video , the viewer should have a hard time disliking the group . Mikael Wood , the critic for Los Angeles Times , commended the group for " having a genuinely great time " , rather than going through the motions . 
- 
- = = Live performances = = 
- 
- As part of its promotion , One Direction performed the song on televised programmes and during their worldwide Take Me Home Tour ( 2013 ) . One Direction performed the track on The Today Show at the Rockefeller Center on 13 November 2012 , to a record crowd estimated at 15 @,@ 000 . " Kiss You " was included in the set list of the group 's 3 December 2012 sold @-@ out show at New York City 's Madison Square Garden . One Direction delivered a performance of " Kiss You " , in front of a video game @-@ themed set , on the final of the ninth series of The X Factor UK on 10 December 2012 . According to the Daily Mail , their " energetic rendition " of " Kiss You " proved that the group have an elusive quality . On 12 December 2012 , the group also performed the number on the final of the second season of The X Factor USA . Considering One Direction the " franchise 's biggest success story " , an editor for The Huffington Post opined that the boy band 's prominent presence on both the US and UK versions of The X Factor seemed fitting . Beyond the Take Me Home Tour , the group also performed the song on the Where We Are Tour ( 2014 ) and the On the Road Again Tour ( 2015 ) . 
- 
- = = Track listing = = 
- 
- CD single 
- " Kiss You " – 3 : 04 
- " Little Things " – 3 : 42 
- 
- = = Credits and personnel = = 
- 
- Carl Falk — writing , production , programming , instruments , guitar , background vocals 
- Kristoffer Fogelmark — background vocals 
- Niall Horan — additional guitar 
- Savan Kotecha — writing , background vocals 
- Kristian Lundin — writing 
- Albin Nedler — writing , background vocals 
- Shellback — writing 
- Rami Yacoub — writing , production , programming , instruments , bass 
- Credits adapted from Take Me Home 's liner notes . 
- 
- = = Charts = = 
- 
- 
- = = Certifications = = 
- 
- 
- = = Release history = = 
- 
- 
- 
- = Ise @-@ class battleship = 
- 
- The Ise @-@ class battleships ( 伊勢型戦艦 , Ise @-@ gata senkan ) were a pair of dreadnought battleships built for the Imperial Japanese Navy ( IJN ) during World War I. Originally intended to be repeats of the preceding Fusō class , they were redesigned before construction began . Both ships carried supplies for the survivors of the Great Kantō earthquake in 1923 . They were modernized in 1934 – 37 with improvements to their armour and machinery and a rebuilt superstructure in the pagoda mast style . Afterwards they played a minor role in the Second Sino @-@ Japanese War . 
- Despite the expensive reconstructions , both vessels were considered obsolete by the eve of the Pacific War , and neither saw significant action in the early years of the war . Following the loss of most of the IJN 's large aircraft carriers during the Battle of Midway in mid @-@ 1942 , they were rebuilt with a flight deck replacing the rear pair of gun turrets to give them the ability to operate an air group of floatplanes . A lack of aircraft and qualified pilots , however , meant that they never actually operated their aircraft in combat . While awaiting their air group the sister ships were sometimes used to ferry troops and material to Japanese bases . They participated in the Battle of Cape Engaño in late 1944 , where they decoyed the American carrier fleet supporting the invasion of Leyte away from the landing beaches . Afterwards both ships were transferred to Southeast Asia ; in early 1945 they participated in Operation Kita , where they transported petrol and other strategic materials to Japan . The sisters were then reduced to reserve until they were sunk during American airstrikes in July . After the war they were scrapped in 1946 – 47 . 
- 
- = = Background = = 
- 
- The design of the Fusō @-@ class battleships was shaped both by the ongoing international naval arms race and a desire among Japanese naval planners to maintain a fleet of capital ships powerful enough to defeat the United States Navy in an encounter in Japanese territorial waters . The IJN 's fleet of battleships had proven highly successful in 1905 , the last year of the Russo @-@ Japanese War , which culminated in the destruction of the Russian Second and Third Pacific Squadrons at the Battle of Tsushima . 
- In the aftermath , the Japanese Empire immediately turned its focus to the two remaining rivals for imperial dominance in the Pacific Ocean : Britain and the United States . Satō Tetsutarō , a Japanese Navy admiral and military theorist , speculated that conflict would inevitably arise between Japan and at least one of its two main rivals . To that end , he called for the Japanese Navy to maintain a fleet with at least 70 % as many capital ships as the US Navy . This ratio , Satō theorized , would enable the Imperial Japanese Navy to defeat the US Navy in one major battle in Japanese waters in any eventual conflict . Accordingly , the 1907 Imperial Defence Policy called for the construction of a battle fleet of eight modern battleships , 20 @,@ 000 long tons ( 20 @,@ 321 t ) each , and eight modern armoured cruisers , 18 @,@ 000 long tons ( 18 @,@ 289 t ) each . This was the genesis of the Eight @-@ Eight Fleet Program , the development of a cohesive battle line of sixteen capital ships . 
- The launch of HMS Dreadnought in 1906 by the Royal Navy raised the stakes , and complicated Japan 's plans . Displacing 17 @,@ 900 long tons ( 18 @,@ 200 t ) and armed with ten 12 @-@ inch ( 30 @.@ 5 cm ) guns , Dreadnought rendered all existing battleships obsolete by comparison . The launch of the battlecruiser HMS Invincible the following year was a further setback for Japan 's quest for parity . When the two new Satsuma @-@ class battleships and two Tsukuba @-@ class armoured cruisers , launched by 1911 , were outclassed by their British counterparts , the Eight @-@ Eight Fleet Program was restarted . 
- The first battleships built for the renewed Eight @-@ Eight Fleet Program were the two dreadnoughts of the Kawachi class , ordered in 1907 and laid down in 1908 . In 1910 , the Navy put forward a request to the Diet ( parliament ) to secure funding for the entirety of the program at once . Because of economic constraints , only four battlecruisers and a single battleship of the Fusō class were ultimately approved by the Diet . Three more Fusō @-@ class ships ( Yamashiro , Ise , and Hyūga ) were approved and all three were ordered in April 1913 . While Yamashiro was laid down later that year , the IJN lacked the funding to proceed with the construction of Ise and Hyūga until the Diet authorized additional funding for the ships in July 1914 . 
- 
- = = Design and description = = 
- 
- The progress of Fusō 's construction , while the IJN waited for the funding to be released and foreign developments , caused the IJN to reassess the Fusō @-@ class design . The distribution of the midships gun turrets was the most obvious flaw as they complicated the protection of the midships magazine and exposed more of the ship to the blast effects of the guns when they fired . Another issue was that Japanese sailors had problems maintaining a high rate of fire with the 45 @.@ 36 @-@ kilogram ( 100 @.@ 0 lb ) shells used in the manually loaded 152 @-@ millimetre ( 6 in ) secondary guns used in the Fusō class and earlier designs . To resolve this issue , the IJN designed a smaller 140 @-@ millimetre ( 5 @.@ 5 in ) gun that offset its lighter shell weight with a higher rate of fire . It also decided that the barbette armour of the earlier ships was too thin and wanted a modest increase in speed to partially counter the higher speeds of the latest foreign ships like the British Queen Elizabeth @-@ class battleships and Russian Borodino @-@ class battlecruisers . For financial reasons more powerful engines could not be ordered so the new design was lengthened slightly and the boiler rooms enlarged to increase speed by 0 @.@ 5 knots ( 0 @.@ 93 km / h ; 0 @.@ 58 mph ) to 23 knots ( 43 km / h ; 26 mph ) . To save weight the forecastle deck was shortened so that the lower midships gun turret was lower than in the Fusō class . This reduced the crew 's accommodations despite a significant increase in the crew 's numbers and naval historian Fukui Shizuo believed that these ships had the worst habitability of any Japanese capital ship . The final design was designated A @-@ 92 by the IJN and differed enough from the A @-@ 64 design of the Fusō class that it was considered a separate class . 
- The ships had a length of 208 @.@ 18 metres ( 683 ft 0 in ) overall , a beam of 28 @.@ 65 metres ( 94 ft 0 in ) and a draught of 8 @.@ 93 metres ( 29 ft 4 in ) at deep load . They displaced 36 @,@ 500 long tons ( 37 @,@ 100 t ) at deep load , roughly 650 long tons ( 660 t ) more than the preceding class . Their crew consisted of 1 @,@ 360 officers and enlisted men . They had a metacentric height of 1 @.@ 737 metres ( 5 ft 8 @.@ 4 in ) at deep load . 
- During the ships ' modernization during the 1930s , their forward superstructures were enlarged with multiple platforms added to their tripod foremasts . Both ships were also given torpedo bulges to improve their underwater protection and to compensate for the weight of the additional armour . In addition , their sterns were lengthened by 7 @.@ 62 metres ( 25 @.@ 0 ft ) . These changes increased their overall length to 213 @.@ 8 metres ( 701 ft ) , their beam to 31 @.@ 75 metres ( 104 ft 2 in ) and their draft to 9 @.@ 45 metres ( 31 ft 0 in ) . Their displacement increased over 5 @,@ 000 long tons ( 5 @,@ 100 t ) to 42 @,@ 001 long tons ( 42 @,@ 675 t ) at deep load . The crew now numbered 1 @,@ 376 officers and enlisted men . 
- 
- = = = Propulsion = = = 
- 
- The Ise @-@ class ships had two sets of direct @-@ drive steam turbines , each of which drove two propeller shafts with 3 @.@ 429 @-@ metre ( 11 ft 3 in ) propellers . The high @-@ pressure turbines drove the wing shafts while the low @-@ pressure turbines drove the inner shafts . The turbines were designed to produce a total of 40 @,@ 000 or 45 @,@ 000 shaft horsepower ( 30 @,@ 000 or 34 @,@ 000 kW ) ( Hyūga and Ise respectively ) , using steam provided by 24 Kampon Ro Gō water @-@ tube boilers at working pressures of 13 – 16 @.@ 9 kg / cm2 ( 1 @,@ 275 – 1 @,@ 657 kPa ; 185 – 240 psi ) . Both ships comfortably exceeded their designed speed of 23 knots ( 43 km / h ; 26 mph ) during their sea trials ; Ise reached 23 @.@ 6 knots ( 43 @.@ 7 km / h ; 27 @.@ 2 mph ) from 56 @,@ 498 shp ( 42 @,@ 131 kW ) and Hyūga exceeded that with 24 knots ( 44 km / h ; 28 mph ) from 63 @,@ 211 shp ( 47 @,@ 136 kW ) . Each of the boilers consumed a mixture of coal and oil and the ships had a stowage capacity of 4 @,@ 607 long tons ( 4 @,@ 681 t ) of coal and 1 @,@ 411 long tons ( 1 @,@ 434 t ) of fuel oil , which gave them a range of 9 @,@ 680 nautical miles ( 17 @,@ 930 km ; 11 @,@ 140 mi ) at a speed of 14 knots ( 26 km / h ; 16 mph ) . Ise and Hyūga had three generators of 150 kilowatts ( 200 hp ) capacity and two 250 @-@ kilowatt ( 340 hp ) turbo generators at 225 volts . 
- During their 1930s modernization , the boilers on each ship were replaced by eight new Kampon oil @-@ fired boilers , fitted into the former aft boiler room , and the forward funnel was removed . The turbines were replaced by four geared Kampon turbines with a designed output of 80 @,@ 000 shp ( 60 @,@ 000 kW ) intended to increase their speed to 24 @.@ 5 knots ( 45 @.@ 4 km / h ; 28 @.@ 2 mph ) . On her trials , Ise reached a top speed of 25 @.@ 26 knots ( 46 @.@ 78 km / h ; 29 @.@ 07 mph ) from 81 @,@ 050 shp ( 60 @,@ 440 kW ) . The fuel storage of the ships was increased to a total of 5 @,@ 113 long tons ( 5 @,@ 195 t ) of fuel oil that gave them a range of 7 @,@ 870 nautical miles ( 14 @,@ 580 km ; 9 @,@ 060 mi ) at a speed of 16 knots ( 30 km / h ; 18 mph ) . 
- 
- = = = Armament = = = 
- 
- The twelve 45 @-@ calibre 35 @.@ 6 cm ( 14 @.@ 0 in ) Type 41 guns of the Ise class were mounted in three pairs of twin @-@ gun , superfiring turrets . Numbered one through six from front to rear , each turret weighed 655 long tons ( 666 t ) . The hydraulically powered turrets had an elevation capability of − 5 / + 20 degrees . The guns had a rate of fire of 1 @.@ 5 – 2 rounds per minute and could be loaded at any angle between -3 and + 20 degrees . In 1921 the elevation was increased to + 30 degrees and then to + 43 degrees during their mid @-@ 1930s modernization , except for No. 6 turret as its supporting structure could not be lowered . The recoil mechanism of the guns was also changed from a hydraulic to a pneumatic system , which allowed for a faster firing cycle of the main guns . 
- By World War II , the guns used Type 91 armour @-@ piercing , capped shells . Each of these shells weighed 673 @.@ 5 kilograms ( 1 @,@ 485 lb ) and was fired at a muzzle velocity of 770 – 775 metres per second ( 2 @,@ 530 – 2 @,@ 540 ft / s ) . They had a maximum range of 25 @,@ 000 metres ( 27 @,@ 000 yd ) at + 20 degrees of elevation and 35 @,@ 450 meters ( 38 @,@ 770 yd ) at + 43 degrees after modernization . Also available was a 625 @-@ kilogram ( 1 @,@ 378 lb ) high @-@ explosive shell that had a muzzle velocity of 805 metres per second ( 2 @,@ 640 ft / s ) . A special Type 3 Sanshikidan incendiary shrapnel shell was developed in the 1930s for anti @-@ aircraft use . 
- The ships ' secondary armament consisted of twenty 50 @-@ calibre 14 @-@ centimetre Type 3 guns . Eighteen of these were mounted in casemates in the forecastle and superstructure and the remaining pair were mounted on the deck above them and protected by gun shields . They had a maximum elevation of + 20 degrees which gave them ranges of 16 @,@ 300 metres ( 17 @,@ 800 yd ) . Each gun had a rate of fire of up to 10 rounds per minute . Anti @-@ aircraft defence was provided by four 40 @-@ calibre 3rd Year Type 8 @-@ centimetre AA guns in single mounts . The 7 @.@ 62 @-@ centimetre ( 3 in ) high @-@ angle guns had a maximum elevation of + 75 degrees , and had a rate of fire of 13 to 20 rounds per minute . They fired a 6 kg ( 13 lb ) projectile with a muzzle velocity of 680 m / s ( 2 @,@ 200 ft / s ) to a maximum height of 7 @,@ 500 metres ( 24 @,@ 600 ft ) . The ships were also fitted with six submerged 53 @.@ 3 @-@ centimetre ( 21 @.@ 0 in ) torpedo tubes , three on each broadside . They carried twelve to eighteen 6th Year Type torpedoes which had a 200 @-@ kilogram ( 440 lb ) warhead . They had three settings for range and speed : 15 @,@ 000 metres ( 16 @,@ 000 yd ) at 26 knots ( 48 km / h ; 30 mph ) , 10 @,@ 000 metres ( 11 @,@ 000 yd ) at 32 knots ( 59 km / h ; 37 mph ) , or 7 @,@ 000 metres ( 7 @,@ 700 yd ) at 37 knots ( 69 km / h ; 43 mph ) . 
- In 1931 – 33 the AA guns were replaced with eight 40 @-@ caliber 12 @.@ 7 cm ( 5 @.@ 0 in ) Type 89 dual @-@ purpose guns , fitted on both sides of the forward superstructures in four twin @-@ gun mounts . When firing at surface targets , the guns had a range of 14 @,@ 700 metres ( 16 @,@ 100 yd ) ; they had a ceiling of 9 @,@ 440 metres ( 30 @,@ 970 ft ) at their maximum elevation of + 90 degrees . Their maximum rate of fire was 14 rounds a minute , but their sustained rate of fire was around eight rounds per minute . Two twin @-@ gun mounts for license @-@ built Vickers two @-@ pounder light AA guns were also added . These guns had a maximum elevation of + 80 degrees and a rate of fire of 200 rounds per minute . The pair of 14 cm guns on the upper deck were removed at this time . 
- During the mid @-@ 1930s reconstruction the torpedo tubes were removed and the Vickers two @-@ pounders were replaced by twenty license @-@ built Hotchkiss 25 mm Type 96 light AA guns in 10 twin @-@ gun mounts . This was the standard Japanese light AA gun during World War II , but it suffered from severe design shortcomings that rendered it a largely ineffective weapon . According to historian Mark Stille , the twin and triple mounts " lacked sufficient speed in train or elevation ; the gun sights were unable to handle fast targets ; the gun exhibited excessive vibration ; the magazine was too small , and , finally , the gun produced excessive muzzle blast " . These 25 @-@ millimetre ( 0 @.@ 98 in ) guns had an effective range of 1 @,@ 500 – 3 @,@ 000 metres ( 1 @,@ 600 – 3 @,@ 300 yd ) , and an effective ceiling of 5 @,@ 500 metres ( 18 @,@ 000 ft ) at an elevation of 85 degrees . The maximum effective rate of fire was only between 110 and 120 rounds per minute because of the frequent need to change the fifteen @-@ round magazines . In addition the forward pair of 14 cm guns in the forecastle were removed at this time and the maximum elevation of the remaining guns was increased to + 30 degrees . 
- 
- = = = Protection = = = 
- 
- The Ise @-@ class ships ' waterline protective belt had a maximum thickness of 299 mm ( 11 @.@ 8 in ) of Vickers cemented armour amidships ; below it was a strake of 100 mm ( 3 @.@ 9 in ) armour . The upper armoured deck consisted of two layers of high @-@ tensile steel 55 mm ( 2 @.@ 2 in ) thick and the lower armoured deck also consisted of two layers of high @-@ tensile steel , but only 30 mm ( 1 @.@ 2 in ) thick . The sides of this deck sloped downwards to meet the bottom of the lower strake of the belt armour . The ends of the belt armour were closed off by bulkheads that ranged in thickness from 203 to 102 mm ( 8 to 4 in ) . The turrets were protected with an armour thickness of 254 mm ( 10 in ) on the face and 76 mm on the roof . The casemate armour was 149 mm ( 5 @.@ 9 in ) thick and that of the barbettes was 299 mm thick rather than the originally planned 305 mm . The sides of the conning tower were 305 mm thick . 
- The Ise class were the only Japanese battleships to place the powder magazine above the shell magazine as the IJN wished to put as much space as possible between the highly flammable propellant and mine and torpedo detonations . The danger from plunging shells at long distances was not appreciated until the fatal magazine explosions of three British battlecruisers during the 1916 Battle of Jutland graphically demonstrated the point . To further protect the magazines the depth of the double bottom was increased to a total of 3 @.@ 58 metres ( 11 ft 9 in ) underneath the barbettes and magazines . Additionally , the vessels contained 660 watertight compartments to preserve buoyancy in the event of battle damage . In addition to the torpedo bulge added when the ships were modernized , the deck armour over the machinery and magazines was increased to a total thickness of 140 mm . Inside the original skin of the ships , two torpedo bulkheads were also added and the turret roofs were increased to a total of 152 millimetres ( 6 in ) of armour . 
- 
- = = = Fire control and sensors = = = 
- 
- While the details of the ship 's fire @-@ control instruments are not fully available , it is known that the ships were fitted with a fire @-@ control director after completion . No computer was fitted at that time and data from the rangefinders had to be processed manually . Turrets 2 , 3 , and 5 were built with imported 6 @-@ metre ( 19 ft 8 in ) Bausch & Lomb rangefinders . These were felt to be inferior to the British Barr & Stroud instruments used on other ships and were removed in 1920 . They were replaced by either the British rangefinders or domestically built instruments of 6 or 8 metres ( 19 ft 8 in or 26 ft 3 in ) length . In the late 1920s the fire @-@ control systems were upgraded and additional platforms were added to the foremast to accommodate them . A pair of directors for the 12 @.@ 7 cm AA guns were added , one on each side of the forward superstructure , in the early 1930s . The fire @-@ control systems were again upgraded in the mid @-@ 1930s and directors were added for the 25 mm AA guns . Both ships had 10 @-@ metre ( 32 ft 10 in ) rangefinders installed at the top of the pagoda mast at that time . Type 21 air @-@ search radars were installed aboard the sisters in mid @-@ 1942 . 
- 
- = = = Aircraft = = = 
- 
- Ise was briefly fitted with an aircraft flying @-@ off platform for a Mitsubishi 1MF3 fighter on Turret No. 2 in 1927 . It was replaced by a platform on Turret No. 5 for a Yokosuka E1Y reconnaissance floatplane in 1928 – 29 . A catapult and a collapsible 4 @-@ tonne ( 3 @.@ 9 @-@ long @-@ ton ) crane were fitted on the stern during the mid @-@ 1930s modernization , and the ships were equipped to operate three floatplanes , although no hangar was provided . The initial Nakajima E4N2 biplanes were replaced by Nakajima E8N2 biplanes in 1938 . 
- 
- = = Conversion to hybrid carriers = = 
- 
- The sinking of the British capital ships Prince of Wales and Repulse by Japanese land @-@ based aircraft on 10 December 1941 led the IJN to realize that battleships could not operate in the face of enemy aircraft and required friendly air support to protect them . The loss of four Japanese aircraft carriers during the Battle of Midway in June 1942 severely limited the ability of the IJN to provide any air cover and alternatives were sought . Earlier proposals to convert one or more battleships into carriers had been made and rejected at the beginning of the war , but they were revived after Midway . Plans for more elaborate conversions were rejected on the grounds of expense and , most critically , time , and the IJN settled on removing the rear pair of turrets and replacing them with a flight deck equipped with two catapults to launch floatplanes . The Ise @-@ class ships were selected for the conversion because Hyūga had suffered an explosion in Turret No. 5 in early May that virtually destroyed the turret and their Turret No. 6 could not elevate to the full + 43 degrees deemed necessary for the long @-@ range engagement anticipated by the IJN . The Fusōs were scheduled to follow once the first two were completed . 
- 
- = = = Armament changes = = = 
- 
- The rear turrets , the barbettes and their supporting structures were removed beginning in early 1943 and the openings in the middle deck were covered by 152 mm plates salvaged from the turret armour . All of the 14 cm guns were removed and the casemate openings sealed off . Four additional twin 12 @.@ 7 cm mounts were added , one pair abreast the funnel and the other abreast the conning tower . The original ten twin 25 mm gun mounts were replaced by triple mounts and nine new triple mounts were added , a total of 57 guns . Two each Type 94 and Type 95 AA directors were added to control the additional guns . The ammunition for these new guns was stored in the magazines originally used for the 14 cm guns and for Turret No. 5 . During 1944 , the ships ' AA defences were reinforced with an additional dozen triple and eleven single 25 mm gun mounts , for a total of 104 barrels , and a pair of Type 13 early warning radars were added . In September six 30 @-@ round AA rocket launchers were added on the sides of the flight deck . 
- 
- = = = Flight deck arrangements = = = 
- 
- A 70 @-@ metre @-@ long ( 229 ft 8 in ) flight deck was built above the stern and stretched forward to the rebuilt aft superstructure . The flight deck was 29 metres ( 95 ft 2 in ) wide at its forward end and 13 metres ( 42 ft 8 in ) at the stern . It overhung the stern and increased the overall length of the ships to 219 @.@ 62 metres ( 720 ft 6 in ) . A pair of rotating gunpowder @-@ propelled catapults were fitted on the sides of the hull , forward of the aft superstructure where they partially restricted the arc of fire of the two amidships turrets . They could launch aircraft up to 4 @,@ 600 kilograms ( 10 @,@ 100 lb ) in weight and required 30 seconds to launch each aircraft . The flight deck had eight permanent storage positions connected by rails to the catapults and the hydraulically operated aircraft lift that brought the aircraft up from the hangar below on the trolleys used to move the floatplanes about . Two aircraft were intended to be stowed on the catapults and three more in temporary positions on the flight deck for a total of thirteen . 
- The 40 @-@ metre @-@ long ( 131 ft 3 in ) hangar was 20 metres ( 65 ft 7 in ) wide forward and 11 metres ( 36 ft 1 in ) at the rear . It was 6 metres ( 19 ft 8 in ) high and designed to stow nine aircraft . It was fitted with fire fighting foam and carbon dioxide dispensers as a result of wartime experience . The ' T ' -shaped lift was 12 @.@ 1 metres ( 39 ft 8 in ) wide at its forward end and 6 @.@ 6 metres ( 21 ft 8 in ) wide at its aft end . It was 12 @.@ 1 metres long and had a capacity of 6 tonnes ( 5 @.@ 9 long tons ) . Petrol storage tanks with a capacity of 76 tonnes ( 75 long tons ) were installed in the former magazine of Turret No. 6 to provide each aircraft with enough fuel for three sorties . To recover the aircraft the collapsible crane formerly on the stern was moved up to the port side of the flight deck . Another crane was intended on the starboard side , but it was never fitted . 
- The ships had an air group of 11 each of Yokosuka D4Y dive bombers ( Allied reporting name " Judy " ) and Aichi E16A reconnaissance aircraft ( Allied reporting name " Paul " ) . Both aircraft had development problems and neither air group ever had all of its intended aircraft . Coupled with a shortage of trained pilots , neither ship ever used its aircraft during combat . 
- 
- = = = Other changes = = = 
- 
- After the loss of the fast battleship Hiei at the Naval Battle of Guadalcanal in late 1942 to rudder damage , the IJN decided to reinforce the protection of the steering compartment and to create an auxiliary steering compartment . The protection of the former was strengthened by the addition of a concrete wall at least 1 metre ( 3 ft 3 in ) in thickness and some of the armour removed from the turrets was used to protect the latter . The double bottom below the former positions of aft turrets was converted to hold fuel oil ; this increased the ships ' endurance to 9 @,@ 500 nautical miles ( 17 @,@ 600 km ; 10 @,@ 900 mi ) at a speed of 16 knots . A pair of Type 22 surface @-@ search radars were also fitted during the conversion . 
- The removal of the secondary armament , the rear turrets and their supporting structures was generally compensated by the addition of the flight deck , hangar , AA guns and more fuel , and the metacentric height increased .23 metres ( 9 @.@ 1 in ) to 2 @.@ 81 metres ( 9 ft 3 in ) at full load as a result of the reduction in the displacement by over 2 @,@ 000 tonnes ( 2 @,@ 000 long tons ) to 40 @,@ 444 tonnes ( 39 @,@ 805 long tons ) . This also reduced the draught to 9 @.@ 03 metres ( 29 ft 8 in ) . The overhang of the flight deck at the stern increased the overall length to 219 @.@ 62 metres ( 720 ft 6 in ) and the beam was slightly reduced to 31 @.@ 71 metres ( 104 ft 0 in ) . 
- 
- = = Ships = = 
- 
- 
- = = Service = = 
- 
- Upon commissioning , the sister ships were assigned to the 1st Battleship Division of the 1st Fleet . Hyūga had an explosion in one of her main gun turrets that killed 11 men and injured 25 in 1919 ; the following year she accidentally collided with and sank a schooner , losing two crewmen . Before the start of the Pacific War , both ships frequently exercised off the coasts of the Soviet Union , Korea and China in addition to training in Japanese waters . Ise hosted Edward , Prince of Wales , and his aide @-@ de @-@ camp Lieutenant Louis Mountbatten in 1922 during the prince 's visit to Japan . In Korea Bay when the 1923 Great Kantō earthquake struck , they sailed to Kyushu where they loaded supplies for the victims on 4 September . Together with two other battleships and a pair of light cruisers , Ise sank the destroyer Yayoi in 1926 during gunnery practice . Ise 's AA armament was upgraded in 1931 and Hyūga 's two years later . The latter ship was modernized in 1934 – 36 and Ise in 1935 – 37 , both at Kure Naval Arsenal . During the Second Sino @-@ Japanese War , the sisters frequently patrolled the Chinese coast in support of the blockade imposed by Japan . In August 1937 Hyūga ferried two battalions of Special Naval Landing Forces to Port Arthur . Three years later , she served as the flagship for the Emperor of the puppet state of Manchukuo , Henry Pu @-@ yi , during his state visit to Japan in June 1940 . On 15 November the ships were transferred to the 2nd Battleship Division of the 1st Fleet . The sisters were refitted in late 1940 in preparation for war , which included the fitting of external degaussing coils and additional AA directors . 
- 
- = = = World War II = = = 
- 
- When Japan began the Pacific War on 8 December , the sisters sortied for the Bonin Islands with four other battleships and the light carrier Hōshō as distant cover for the fleet attacking Pearl Harbor , and returned six days later . On 11 March 1942 Ise and Hyūga sortied from their anchorage at Hashirajima to join the unsuccessful search for the American carrier force that had attacked Marcus Island a week earlier . Similarly they pursued but did not catch the American carriers that had launched the Doolittle Raid on 18 April . 
- During gunnery training on 5 May , there was a premature detonation in the left gun of Hyūga 's Turret No. 5 that disabled both guns and killed 51 crewmen . Both aft magazines were flooded to douse the resulting fire and save the ship . She received temporary repairs during which the turret was removed and replaced by a circular armour plate on which three triple 25 mm gun mounts were positioned . On 11 May a valve in Ise 's No. 2 engine room stuck in the open position and flooded the engine room . While under repair at Kure , both ships received prototype Type 21 radars . Commanded by Vice @-@ Admiral Shirō Takasu , the 2nd Battleship Division set sail with the Aleutian Support Group on 28 May , at the same time that most of the Imperial Fleet began an attack on Midway Island ( Operation MI ) . 
- They returned home on 14 June and the IJN began preliminary planning to replace the lost carriers with hybrid carriers converted from battleships . The sisters were selected for conversion and detached from the division on 14 July in preparation . They remained on " standby alert " until the actual conversions began . Ise was converted at Kure Naval Arsenal from 23 February to 5 September 1943 and Hyūga at Sasebo Naval Arsenal from 2 May to 30 November . 
- After completing her sea trials , Ise was attached to the Imperial Japanese Naval Academy at Etajima and ferried troops and munitions to the naval base at Truk in October . In November the ship began working up , joined by the newly completed Hyūga the following month , and both rejoined the 2nd Battleship Division . On 1 May 1944 , the sisters were transferred to Rear Admiral Matsuda Chiaki 's reformed Fourth Carrier Division of the 3rd Fleet . The division 's 634th Naval Air Group was formed that same day and conducted its first catapult launches in late June . 
- 
- = = = = Battle of Cape Engaño = = = = 
- 
- Shortages of aircraft and serviceability problems greatly retarded pilot training and the ships only had a total of 17 D4Ys and 18 E16As on hand on 1 October ; of these , only 6 and 16 were operational , respectively . The Japanese plan for the defence of the Philippines envisioned that the surviving carriers would be used to lure the American carrier forces away from the invasion area to a position where the carriers could be attacked by land @-@ based aircraft and the transports by the rest of the IJN . The other carrier air groups were not in much better shape and the Japanese decided to retain the aircraft ashore for use against the American carriers . The Fourth Carrier Division was assigned to the Northern Force under the command of Vice Admiral Jisaburō Ozawa and the sisters sailed from Yashima on 20 October . On the morning of 24 October , the bulk of the few aircraft aboard were launched to attack the American carriers as a distraction . They inflicted no damage and caused the Americans to search in the direction from which they had attacked . The Americans finally spotted the Japanese carriers at 16 : 40 , some 200 miles ( 320 km ) east of Cape Engaño , the northeastern tip of Luzon . The American carriers were spread out and it was very late in the day to launch an airstrike , so Admiral William Halsey , commander of the Third Fleet decided to mass his carriers in a position to attack the following morning . Ozawa reversed course during the night , correctly believing that the Americans would follow him north . 
- Although they had lost contact during the night , the Americans did find the Japanese carriers at 07 : 35 . They had already launched an airstrike of 180 aircraft that was orbiting 50 miles ( 80 km ) ahead of the American carriers while waiting for the Japanese ships to be located . This was just the first of a total of five airstrikes that the Americans launched that day . The sisters were not heavily engaged by the early airstrikes which were focusing on the group 's aircraft carriers . Ise claimed to have shot down five attacking dive bombers from the second wave and one small bomb detonated on Turret No. 2 . Hyūga was lightly damaged by near misses that ruptured some hull plating in her bulge and peppered her superstructure with splinters . She took on a 5 @-@ degree list that was quickly corrected before she was ordered to tow the crippled carrier Chiyoda to safety . Her attempt was unsuccessful and Chiyoda had to be abandoned to her fate . 
- Ise was attacked by 80 @-@ odd aircraft from the fourth wave , but they failed to inflict any serious damage . She dodged 11 torpedoes and was only hit by a bomb once , on the bulge outboard of the port catapult . Some 34 other bombs near missed her , spraying her with splinters and rupturing some hull plates that contaminated some fuel oil and caused leaks in her port boiler rooms . While an exact total of her casualties is not available , it has been estimated that 5 men were killed and some 111 – 121 crewmen were wounded during this attack . Hyūga was unsuccessfully attacked by an American submarine at 18 : 43 . Around 19 : 00 Ozawa learned about a force of destroyers and cruisers that drove off the Japanese destroyers rescuing survivors from some of the carriers lost earlier in the day and sank Chiyoda . He ordered the Fourth Carrier Division to reverse course and engage the Americans , but the battleships were unable to find them , and Ozawa ordered them to reverse course and head for Amami Ōshima . When they arrived on 27 October , Ozawa transferred to Hyūga and hoisted his flag aboard her . While en route for Kure , the division was unsuccessfully attacked by another submarine . 
- In early November the catapults were removed from both ships , and they loaded troops and munitions later that month . While en route they were diverted to the Spratly Islands upon reports of heavy air raids at Manila . After off @-@ loading their cargo , they sailed for Lingga Island , near Singapore , on 20 November . They transferred to Cam Ranh Bay , French Indochina and Hyūga became flagship of the 5th Fleet there on 14 December . The division sailed for Singapore on 30 December and Vice Admiral Kiyohide Shima transferred his flag to the light cruiser Ōyodo on arrival there the following day . The division continued onwards to Lingga . Its planned return to Japan was delayed by attacks by the American Third Fleet on targets in Indochina and southern China that sank two oil tankers that were intended to refuel the division . 
- The IJN then decided to use the sisters and their escorts to bring a load of petrol , rubber , tin and other strategic minerals back to Japan after the American carriers departed the South China Sea ( Operation Kita ) . They loaded their cargoes beginning on 6 February at Singapore and departed four days later . Also carrying some 1 @,@ 150 oilfield workers , they were escorted by Ōyodo and three destroyers . Decrypted Japanese radio signals revealed the Japanese plan to the Allies , and 15 submarines were positioned along their anticipated route in an attempt to intercept and sink the ships . An additional 11 were moved into position while the group was en route , but only three were ultimately able to attack . None of them were successful before the Japanese reached Kure on 20 February . The Fourth Carrier Division was disbanded on 1 March and the sisters were reduced to 1st rank reserve ships . On 19 March Kure was attacked by aircraft from Task Force 58 and Hyūga was hit three times by bombs that killed 37 men and wounded 52 . Her gunners claimed to have shot down one American dive bomber during the attack . Ise was hit twice during the attack , but her casualties , if any , are unknown . 
- The ships were turned into floating AA batteries over the next several months although it availed them little when they were attacked again by American carrier aircraft in July . On the 24th Ise was struck by five bombs and near missed multiple times ; all told she lost 50 crewmen killed and many others wounded . The bombs started numerous leaks and Ise began to settle by the bow , although she was returned to an even keel after three @-@ days pumping . Hyūga was a primary focus of the attack and she received 10 direct hits and up to 30 near misses . She was badly damaged with some 200 @-@ odd crewmen killed and 600 wounded during the attack . She slowly foundered over the next two days and was not attacked when the Americans returned four days later . This time it was Ise 's turn and she was struck 11 or more times with many near misses that put her on the bottom in shallow water with a 15 degree list . The sisters were struck off the Navy List in November and their wrecks were scrapped after the war . 
- 
- 
- = Dick Rifenburg = 
- 
- Richard Gale " Dick " Rifenburg ( August 21 , 1926 – December 5 , 1994 ) was an American football player and a pioneering television broadcaster for the forerunner to WIVB @-@ TV in Buffalo . He played college football for the University of Michigan Wolverines in 1944 and from 1946 to 1948 . He was a consensus selection at end on the 1948 College Football All @-@ America Team . Rifenburg played professionally in the National Football League ( NFL ) with the Detroit Lions for one season in 1950 . After retiring from football he settled in Buffalo and became a sports broadcaster . He worked as a color commentator and as a play @-@ by @-@ play announcer for the Buffalo Bulls . He hosted various television and radio sports shows and was eventually inducted into the Buffalo Broadcasters Hall of Fame . 
- In college , he led the Big Ten Conference in single season receptions during his senior year and set Michigan Wolverines receptions records for both career touchdown and single @-@ season touchdowns . He had also been a Michigan High School Athletic Association ( MHSAA ) state champion in both basketball and track and field . His college career was interrupted by World War II service , and his high school career was also affected by the war due to the MHSAA 's cancellation of state championships in all sports in 1943 . 
- 
- = = High school = = 
- 
- Rifenburg was born in Petoskey , Michigan , and raised in Kalamazoo , Michigan before his family moved to Saginaw , Michigan . Rifenburg was a star athlete at Saginaw 's Arthur Hill High School in football , basketball , and track and field . In 1943 , Michigan canceled boys high school tournaments in all sports due to World War II , and they did not return until the fall of 1944 . In 1944 , he led Arthur Hill High to the MHSAA Class A high school basketball championship ( over Kalamazoo Central High School ) , scoring 24 points , including 17 in the second half , of the championship game . Rifenburg was also the state champion in 1944 in both the shot put 46 feet 11 inches ( 14 @.@ 30 m ) and high jump 5 feet 8 @.@ 5 inches ( 1 @.@ 74 m ) . He also led Arthur Hill in football , and his high school accomplishments are featured in Glory : The history of Saginaw County sports by Jack Tany ( ASIN B0006RH9Z6 ) , which is a book on high school sports in Saginaw County , Michigan . Rifenburg was named All State in football , basketball and track . 
- It is ironic that Rifenburg was born in Petoskey , Michigan in 1926 for several reasons . Ted Petoskey preceded Rifenburg as an All @-@ American end on the University of Michigan football team . Petoskey had excelled as a representative of Saginaw County in MHSAA competition . Petoskey posted significant football accomplishments in 1926 making 1926 a significant year for himself as well . Achieving All @-@ American status as an end at Michigan would be Rifenburg 's next step after excelling in MHSAA competition . 
- 
- = = College = = 
- 
- In the fall of 1944 , Rifenburg enrolled at the University of Michigan . The United Press syndicate ran a feature article about Rifenburg in September 1944 that opened as follows : " Another great end has made his appearance on the Big Ten football horizon in the person of Dick Rifenburg , 18 @-@ year @-@ old Michigan freshman . Every so often a great offensive end comes along , a player who has to learn how to play defense , but who has the natural speed , smooth actions , height and big hands that is the mark of an outstanding pass receiver . Rifenburg has laid claim to that rating . A loose @-@ limbed 180 @-@ pound freshman from Saginaw , Mich . , Rifenberg is being boomed as the Big Ten 's next ' freshman sensation . ' " As a freshman , he caught two touchdown passes in his first college football game against Iowa . In an article titled " Teens and TNT , " Time reported on Rifenburg 's performance : " Of the few teams already in action , Michigan 's teens rang the freshman bell loudest last week by winning their opener , 12 -to @-@ 7 , against the strong Iowa Seahawks ( Naval Pre @-@ Flight ) ; 6 @-@ ft . 4 Freshman End Dick Rifenburg caught passes and ran for both Michigan touchdowns . " 
- Rifenburg 's college career was interrupted by World War II service in the United States Navy , but after missing the 1945 season , he returned to play for the Wolverines from 1946 to 1948 . Rifenburg played for the Wolverines in consecutive undefeated National Championship seasons in 1947 and 1948 . He started nine games for the 1947 team . The 1947 team referred to as " Michigan 's Mad Magicians " is considered to be the greatest University of Michigan football team of all time . Rifenburg and teammate Len Ford had the reputation as the team practical jokers . During the 1947 game against Wisconsin , Rifenburg started calling signals for the Badgers . Wisconsin 's offense protested to officials , who " prowled the Wolverines secondary but never caught their man . " Rifenburg continued to scramble Badger signals , as Rifenburg 's teammates laughed at his scheme . In the January 1 , 1948 Rose Bowl that season , Michigan rolled to a 49 – 0 victory over USC , and they outgained the Trojans 491 yards to 133 . Rifenburg caught a 29 @-@ yard pass for the game 's final score . 
- In the 1948 championship season , Rifenburg scored eight touchdowns , caught 22 passes , and gained 610 yards ( 508 receiving and 102 rushing ) . Rifenburg was the second highest scoring end in the nation in 1948 , and he was a consensus All @-@ American as a senior , being selected as first team on nine of the 11 All @-@ American teams . Rifenburg led the Big Ten in receptions . 
- Although Rifenburg finished fourth among midwestern Heisman voters in 1948 , he did not finish among the top eight . By comparison , Notre Dame end Leon Hart won the Heisman Trophy in 1949 but made only eight of the 11 All @-@ American teams . It is not clear why Rifenburg did not finish higher . However , it is fairly clear that sportswriters of that era had a bias against Michigan . In the Associated Press poll at the end of the 1947 season , the Notre Dame Fighting Irish were ranked ahead of the University of Michigan , though both teams were undefeated . Some noted that every Southern AP voter had voted for Notre Dame , which had yet to integrate , whereas three of Michigan 's star players ( Bob Mann , Gene Derricotte , and Len Ford ) were African @-@ American . The Southern schools refused even to schedule games against schools that played African @-@ American players . 
- Rifenburg was considered one of the greatest Wolverines of the 1940s . In four seasons with the Michigan Wolverines , Rifenburg played in 32 games and had over 1 @,@ 000 yards of total offense . Rifenburg held the University of Michigan 's single season and career record for touchdown receptions ( eight in a season ; sixteen career ) until his records were broken by Anthony Carter in 1980 . 
- 
- = = Professional career = = 
- 
- In 1948 , Rifenburg was drafted by the Philadelphia Eagles in the 15th round of the NFL draft , and he was also drafted by the New York Yankees of the All @-@ America Football Conference . He had intended to play in 1949 with the Yankees , but suffered a knee injury in a practice session for the August 1949 Chicago College All @-@ Star Game . Press accounts at the time noted that the injury " will probably keep him out of pro football all season , if not forever . " The incident led to a debate as to whether NFL owners should " bar their men from playing with the college all @-@ stars . " 
- Rifenburg landed a job at WJR radio in Detroit , but he left his sportscaster 's job to join the Detroit Lions . In the 1950 NFL season , Rifenburg came back from his injury to play for the Detroit Lions . He played in 12 games and had ten receptions for 96 yards and one touchdown for the 1950 Lions . Rifenburg recalled that his playing time with the Lions was limited because the Lions also signed 1949 Heisman Trophy winner Leon Hart , who played the same position . 
- In May 1951 , he announced he was retiring from professional football to become sports director at a radio station in Buffalo . He was hired as a sportscaster by WBEN ( now known as WIVB ) , which had just started the first television station in Buffalo and the only one serving Southern Ontario . This was an early foray into television by the Buffalo Evening News . In the 1950s , Rifenburg hosted a popular panel show called " Let 's Talk Sports " in Buffalo and also pioneered an early morning exercise program . He also worked for WBEN ( AM ) and WBEN ( FM ) and as the sideline announcer for Buffalo Bills games along with Van Miller , the long time Bills play @-@ by @-@ play announcer . In addition , he served as the play @-@ by @-@ play announcer for the University of Buffalo Bulls football team . As a radio broadcaster , he is remembered for things ranging from ski reports , to 17 years worth of " Breakfast At — " programs live from various local restaurants , to 27 years as the WBEN @-@ AM All Night Show host . 
- After 30 years with WBEN and a change in ownership for the station , his show was replaced with the Mutual Network 's The Larry King Show . In the 1980s , Rifenburg taught communications at Buffalo 's Medaille College and served as a disc jockey on Public Broadcasting 's radio station WEBR ( now WDCZ ) . He also sold ads for Buffalo Evening News competitor , Buffalo Courier @-@ Express . Rifenburg 's final employer was Erie County , who hired him as an inmate training supervisor at the Erie County Correctional Facility . 
- Rifenburg was posthumously inducted into the Buffalo Broadcasters Hall of Fame in September 2007 . He was given the Golden Age Award which is reserved for " those who did it first , the people who had no pattern to follow . " The Hall of Fame award was presented to Rifenburg 's wife , Jane . In her acceptance speech , Jane Rifenburg observed that despite all of her late husband 's achievements , there was one thing he had never received : " He had a great career , but he never had a trophy . And now he has . " 
- 
- = = Family = = 
- 
- Rifenburg lived 37 of his years in Buffalo . His wife , the former Jane Morris , was the head of the Buffalo Jills cheerleaders when they met . Rifenburg , who was survived by three sons ( Douglas A. , Gary R. , and Bruce R. ) , one daughter ( Wendy J. Colf ) and two grandchildren , died in Cheektowaga , New York in December 1994 ; he was 68 years old . Doug was a 1988 first team football All @-@ Western New York linebacker for Clarence High School . 
- 
- 
- = 1933 Treasure Coast hurricane = 
- 
- The 1933 Treasure Coast hurricane was the second @-@ most intense tropical cyclone to strike the United States during the active 1933 Atlantic hurricane season . The eleventh tropical storm , fifth hurricane , and the third major hurricane of the season , it formed east @-@ northeast of the Leeward Islands on August 31 . The tropical storm moved rapidly west @-@ northwestward , steadily intensifying to a hurricane . It acquired peak winds of 140 miles per hour ( 225 km / h ) and passed over portions of the Bahamas on September 3 , including Eleuthera and Harbour Island , causing severe damage to crops , buildings , and infrastructure . Winds over 100 mph ( 161 km / h ) affected many islands in its path , especially those that encountered its center , and many wharves were ruined . 
- Subsequently , it weakened and made landfall at Jupiter , Florida , early on September 4 with winds of 125 mph ( 201 km / h ) . The hurricane moved across the state , passing near Tampa before moving into Georgia and dissipating . In Florida , the strong winds of the cyclone blew buildings off their foundations , and numerous trees were prostrated in citrus groves . The Treasure Coast region received the most extensive destruction , and Stuart , Jupiter , and Fort Pierce were heavily damaged . Inland , the cyclone weakened rapidly but produced prodigious amounts of rain , causing a dam to collapse near Tampa . The storm caused $ 3 million in damage ( 1933 USD ) after damaging or destroying 6 @,@ 848 homes . 
- Unusually , the storm hit Florida less than 24 hours before another major hurricane bearing 125 @-@ mile @-@ per @-@ hour ( 201 km / h ) winds struck South Texas ; never have two major cyclones hit the United States in such close succession . 
- 
- = = Meteorological history = = 
- 
- The origins of the hurricane were from a tropical wave that possibly spawned a tropical depression on August 27 , although there was minimal data over the next few days as it tracked to the west @-@ northwest . On August 31 , a nearby ship reported gale force winds , which indicated that a tropical storm had developed to the east @-@ northeast of the Lesser Antilles . Based on continuity , it is estimated the storm attained hurricane status later that day . Moving quickly to the west @-@ northwest , the storm passed north of the Lesser Antilles and Puerto Rico . Early on September 2 , a ship called the Gulfwing reported a barometric pressure of 978 mbar ( 28 @.@ 88 inHg ) , which confirmed that the storm attained hurricane status . After passing north of the Turks and Caicos islands , the hurricane struck Eleuthera and Harbour Island in the Bahamas on September 3 , the latter at 1100 UTC . A station on the latter island reported a pressure of 27 @.@ 90 inHg ( 945 mb ) during the 30 minute passage of the eye . Based on the pressure and the small size of the storm , it is estimated the hurricane struck Harbour Island with peak winds of 140 mph ( 225 km / h ) , making it the equivalent of a modern Category 4 hurricane on the Saffir @-@ Simpson scale . Interpolation suggested that the storm reached major hurricane status , or Category 3 status , on September 2 . 
- The hurricane initially followed the course of another hurricane that passed through the area in late August , which ultimately struck Cuba and Texas . This hurricane instead maintained a general west @-@ northwest track . After moving through the northern Bahamas , the hurricane weakened slightly before making landfall at Jupiter , Florida , at 0500 UTC on September 4 . A station there reported a pressure of 27 @.@ 98 inHg ( 948 mb ) during a 40 minute period of the eye 's passage ; this suggested a landfall strength of 125 mph ( 201 km / h ) . At the time , the radius of maximum winds was 15 mi ( 24 km ) , which was smaller than average . After landfall , the hurricane weakened rapidly while crossing the state . It briefly emerged into the Gulf of Mexico as a tropical storm early on September 5 . A few hours later while continuing to the northwest , it made another landfall near Rosewood — a ghost town in Levy County , east of Cedar Key — with winds of about 65 mph ( 105 km / h ) . Turning to the north , the storm slowly weakened as it crossed into Georgia , dissipating on September 7 near Augusta . 
- 
- = = Preparations and impact = = 
- 
- On September 2 , a fleet of eight aircraft evacuated all white residents from West End , Grand Bahama , to Daytona Beach , Florida . While the storm was near peak intensity on September 3 , the Weather Bureau issued hurricane warnings from Miami to Melbourne , Florida , with storm warnings extending northward to Jacksonville . Later that day , storm warnings were issued from Key West to Cedar Key . About 2 @,@ 500 people evacuated by train from areas around Lake Okeechobee . By evening on September 3 , high tides sent sea spray over coastal seawalls in Palm Beach County as residents boarded up buildings ; structures on Clematis Street in West Palm Beach were said to be a " solid front " of plywood . Along the coast , observers reported very rough seas as the eye neared land . 
- The powerful hurricane moved over or near several islands in the Bahamas . Winds on Spanish Wells and Harbour Island were both estimated at around 140 mph ( 225 km / h ) . Winds reached 110 mph ( 177 km / h ) at Governor 's Harbour , 100 mph ( 161 km / h ) on Eleuthera , and 120 mph ( 193 km / h ) on the Abaco Islands . The storm was farther away from Nassau , where winds reached 61 mph ( 98 km / h ) . The hurricane damaged a lumber mill on Abaco , washing away a dock . Heavy damage occurred on Harbour Island , including to several roofs , the walls of government buildings , and the water system . The hurricane destroyed four churches and 37 houses , leaving 100 people homeless . A 1 @.@ 5 mi ( 2 @.@ 4 km ) road on Eleuthera was destroyed . Several islands sustained damage to farms , including the total loss of various fruit trees on Russell Island . Despite Category 4 winds on Spanish Wells , only five houses were destroyed , although most of the remaining dwellings lost their roofs . Collectively between North Point , James Cistern , and Gregory Town on Eleuthera , the storm destroyed 55 houses and damaged many others . On Grand Bahama , where a 9 to 12 ft ( 2 @.@ 7 to 3 @.@ 7 m ) storm surge was reported , half of the houses were destroyed , as were 13 boats and two planes , and most docks were wrecked . 
- When the storm moved ashore in Florida , winds reached an estimated 125 mph ( 201 km / h ) in Jupiter ; these occurred after the eye passed . In West Palm Beach , anemometers measured at least 80 @-@ mile @-@ per @-@ hour ( 129 km / h ) winds with gusts to 100 mph ( 161 km / h ) ; barometers ranged from 28 @.@ 64 to 28 @.@ 78 inHg ( 970 to 975 mb ) . The storm produced the strongest winds in the city since the 1928 Okeechobee hurricane . Winds were not as strong farther from the center ; 40 to 45 mph ( 64 to 72 km / h ) winds were observed in Miami to the south , Titusville to the north , and Tampa on the west coast . Fort Pierce estimated peak winds of 80 to 90 mph ( 129 to 145 km / h ) , and pressures dipped to 29 @.@ 14 inHg ( 987 mb ) . Inland , winds near Lake Okeechobee peaked at only 60 mph ( 97 km / h ) . The hurricane dropped heavy rainfall along its path , peaking at 17 @.@ 8 in ( 450 mm ) in Clermont . 
- At West Palm Beach , the majority of the damage was confined to vegetation . Several coconut and royal palms that withstood the 1928 hurricane snapped , littering streets with broken trunks . Winds downed road signs on many streets , and floodwaters covered the greens on a local golf course . Some garages and isolated structures , mostly lightweight , were partly or totally destroyed , along with a lumber warehouse . Some homes that lost roofing shingles had water damage to their interiors as well . Nearby Lake Worth sustained extensive breakage of windows , including plate glass , and loss of tile and shingle roofing , but preparations reduced losses to just several thousand dollars , and no post @-@ storm accidents took place . Strong winds snapped many light poles in the city , and trees and shrubs were broken or uprooted . As in Lake Worth , officials in West Palm Beach credited preparations and stringent building codes with reducing overall damage . The city had learned from previous experience with severe storms in 1926 , 1928 , and 1929 . High tides eroded Ocean Boulevard at several spots and disrupted access to several bridges on the Lake Worth Lagoon . Winter estates and hotels on Palm Beach generally sustained little material damage , except to vegetation , and county properties went largely unscathed . 
- In Martin and St. Lucie counties , the storm was considered among the worst on record . The storm leveled some homes and swept many others off their foundations . At Stuart , winds removed or badly damaged 75 % of the roofs in town . The storm destroyed the third floor of the building that housed a bowling alley and the Stuart News , a local newspaper . At Olympia , an abandoned settlement also known as Olympia Beach , strong winds leveled the old Olympia Inn , a gas station , and the second floor of a pharmaceutical building . Winds also tore the roof off an ice plant . A bridge leading to the barrier island from Olympia was partly wrecked ; the bridge tender survived by gripping the railing during the storm . Winds leveled his nearby home . According to the Monthly Weather Review , some of the most severe damage from the storm in Florida was at Olympia . The storm left many homes in Hobe Sound uninhabitable , forcing crews to tear them down . Winter estates on the island , however , were better built and little damaged . While Stuart and Hobe Sound sustained significant damage , Port Salerno suffered minimally . In Stuart , the storm left 400 to 500 people homeless , up to nearly 10 % of the population , which was 5 @,@ 100 at the time . Between Jupiter and Fort Pierce , the storm knocked down power and telegraph lines . In the latter city , high waves washed out a portion of the causeway . In the 1980s , an elderly resident recalled that the storm was the most severe on record in Fort Pierce . 
- Crop damage was worst along the Indian River Lagoon ; several farms in Stuart experienced total losses , and statewide , 16 % of the citrus crop , or 4 million boxes , were destroyed . Many chicken coops in Stuart were destroyed , and the local chicken population was scattered and dispersed as far as Indiantown . Across southeastern Florida , the hurricane damaged 6 @,@ 465 houses and destroyed another 383 , causing over $ 3 million in damage . One person , an African American farm worker , was killed when his shack blew down in Gomez , a brakeman died after seven railcars derailed , and a child was killed by airborne debris . 
- High rainfall caused flooding across Florida , notably near Tampa where waters reached 9 ft ( 2 @.@ 7 m ) deep . High rainfall of over 7 in ( 180 mm ) caused a dam operated by Tampa Electric Co. to break 3 mi ( 4 @.@ 8 km ) northeast of Tampa along the Hillsborough River . The break resulted in severe local damage , flooding portions of Sulphur Springs . Workers attempted to save the dam with sandbags , and after the break , most residents in the area were warned of the approaching flood . Over 50 homes were flooded , forcing about 150 people to evacuate . Outside Florida , the storm produced winds of 48 and 51 mph ( 78 and 81 km / h ) in Savannah , Georgia and Charleston , South Carolina , respectively . In the latter city , the storm spawned a tornado , which caused about $ 10 @,@ 000 in property damage . Heavy rainfall occurred along the Georgia and South Carolina coasts , reaching over 12 in ( 300 mm ) . Light rainfall also extended into North Carolina . 
- 
- = = Aftermath = = 
- 
- In the Bahamas after the storm , a boat sailed from Nassau to deliver food and building materials to Eleuthera . 
- After the storm , the National Guard offered shelters for at least 400 homeless residents in Stuart . Of the 7 @,@ 900 families adversely affected by the hurricane , 4 @,@ 325 required assistance from the American Red Cross . Farmers in Texas , also affected by a major hurricane , requested growers in Florida wait 15 days so they could sell their citrus crop that fell . The damaged dam near Tampa initially resulted in waters from the Hillsborough River being pumped into the city 's water treatment plant , and a new dam was eventually built in 1944 . 
- 
- 
- = Second Battle of Naktong Bulge = 
- 
- The Second Battle of Naktong Bulge was an engagement between United Nations ( UN ) and North Korean ( NK ) forces early in the Korean War from September 1 to September 15 , 1950 , along the Naktong River in South Korea . It was a part of the Battle of Pusan Perimeter , and was one of several large engagements fought simultaneously . The battle ended in a victory for the United Nations after large numbers of United States ( US ) and Republic of Korea ( ROK ) troops repelled a strong North Korean attack . 
- After the First Battle of Naktong Bulge , the US Army 's 2nd Infantry Division was moved to defend the Naktong River line . The division , which was untried in combat , was struck with a strong attack by several divisions of the Korean People 's Army which crossed the river and struck all along the division 's line . The force of the attack split the US 2nd Infantry Division in half , and the North Koreans were able to penetrate to Yongsan , prompting a fight there . 
- The urgency of the threat to Pusan Perimeter prompted the US Marine Corps 1st Provisional Marine Brigade to be brought in to reinforce the US Army troops . In two weeks of heavy fighting , the US forces were able to force the North Koreans out of the Naktong Bulge region . The North Koreans were further repulsed after the UN counterattack at Inchon , which culminated in the virtual destruction of the North Korean army . 
- 
- = = Background = = 
- 
- 
- = = = Pusan Perimeter = = = 
- 
- From the outbreak of the Korean War and the invasion of South Korea by the North , the North Korean People 's Army had enjoyed superiority in both manpower and equipment over both the Republic of Korea Army and the United Nations forces dispatched to South Korea to prevent it from collapsing . The North Korean strategy was to aggressively pursue UN and ROK forces on all avenues of approach south and to engage them aggressively , attacking from the front and initiating a double envelopment of both flanks of the unit , which allowed the North Koreans to surround and cut off the opposing force , which would then be forced to retreat in disarray , often leaving behind much of its equipment . From their initial June 25 offensive to fights in July and early August , the North Koreans used this strategy to effectively defeat any UN force and push it south . However , when the UN forces , under the Eighth United States Army , established the Pusan Perimeter in August , the UN troops held a continuous line along the peninsula which North Korean troops could not flank , and their advantages in numbers decreased daily as the superior UN logistical system brought in more troops and supplies to the UN army . 
- When the North Koreans approached the Pusan Perimeter on August 5 , they attempted the same frontal assault technique on the four main avenues of approach into the perimeter . Throughout August , the NK 6th Division , and later the NK 7th Division engaged the US 25th Infantry Division at the Battle of Masan , initially repelling a UN counteroffensive before countering with battles at Komam @-@ ni and Battle Mountain . These attacks stalled as UN forces , well equipped and with plenty of reserves , repeatedly repelled North Korean attacks . North of Masan , the NK 4th Division and the US 24th Infantry Division sparred in the Naktong Bulge area . In the First Battle of Naktong Bulge , the North Korean division was unable to hold its bridgehead across the river as large numbers of US reserve forces were brought in to repel it , and on August 19 , the NK 4th Division was forced back across the river with 50 percent casualties . In the Taegu region , five North Korean divisions were repulsed by three UN divisions in several attempts to attack the city during the Battle of Taegu . Particularly heavy fighting took place at the Battle of the Bowling Alley where the NK 13th Division was almost completely destroyed in the attack . On the east coast , three more North Korean divisions were repulsed by the South Koreans at P 'ohang @-@ dong during the Battle of P 'ohang @-@ dong . All along the front , the North Korean troops were reeling from these defeats , the first time in the war their strategies were not working . 
- 
- = = = September push = = = 
- 
- In planning its new offensive , the North Korean command decided any attempt to flank the UN force was impossible thanks to the support of the UN navy . Instead , they opted to use a frontal attack to breach the perimeter and collapse it as the only hope of achieving success in the battle . Fed by intelligence from the Soviet Union , the North Koreans were aware the UN forces were building up along the Pusan Perimeter and that they must conduct an offensive soon or they could not win the battle . A secondary objective was to surround Taegu and destroy the UN and ROK units in that city . As part of this mission , the North Korean units would first cut the supply lines to Taegu . 
- On August 20 , the North Korean commands distributed operations orders to their subordinate units . The North Koreans called for a simultaneous five @-@ prong attack against the UN lines . These attacks would overwhelm the UN defenders and allow the North Koreans to break through the lines in at least one place to force the UN forces back . Five battle groupings were ordered . The center attack called for the NK 9th Division , NK 4th Division , NK 2nd Division , and NK 10th Division to break through the US 2nd Infantry Division at the Naktong Bulge to Miryang and Yongsan . 
- 
- = = Battle = = 
- 
- During the North Koreans ' September 1 offensive , the US 25th Infantry Division 's US 35th Infantry Regiment was heavily engaged in the Battle of Nam River north of Masan . On the 35th Regiment 's right flank , just north of the confluence of the Nam River and the Naktong River , was the US 9th Infantry Regiment , US 2nd Infantry Division . There , in the southernmost part of the 2nd Infantry Division zone , the 9th Infantry Regiment held a sector more than 20 @,@ 000 yards ( 18 @,@ 000 m ) long , including the bulge area of the Naktong where the First Battle of Naktong Bulge had taken place earlier in August . Each US infantry company on the river line here had a front of 3 @,@ 000 feet ( 910 m ) to 4 @,@ 000 feet ( 1 @,@ 200 m ) and they held only key hills and observation points , as the units were extremely spread out along the wide front . 
- During the last week of August , US troops on these hills could see minor North Korean activity across the river , which they thought was North Koreans organizing the high ground on the west side of the Naktong against a possible American attack . There were occasional attacks on the 9th Infantry 's forward positions , but to the men in the front lines this appeared to be only a standard patrol action . On August 31 , the UN forces were alerted to a pending attack when much of the Korean civilian labor force fled the front lines . Intelligence officers reported an attack was coming . 
- On the west side of the Naktong , North Korean Major General Pak Kyo Sam , commanding the NK 9th Division , issued his operations order to the division on August 28 . Its mission in the forthcoming attack was to outflank and destroy the US troops at Naktong Bulge by capturing the Miryang and Samnangjin areas to cut off the US 2nd Division 's route of supply and withdrawal between Taegu and Pusan . However , the North Koreans weren 't aware that the US 2nd Infantry Division had recently replaced the US 24th Infantry Division in positions along the Naktong River . Consequently , they expected lighter resistance ; the 24th troops were exhausted from months of fighting but the 2nd Division men were fresh and newly arrived in Korea . They had only recently been moved into the line . The North Koreans began crossing the Naktong River under cover of darkness at certain points . 
- 
- = = = Battle of Agok = = = 
- 
- On the southern @-@ most flank of the 9th Infantry river line , just above the junction of the Nam River with the Naktong , A Company of the 1st Battalion was dug in on a long finger ridge paralleling the Naktong that terminates in Hill 94 at the Kihang ferry site . The river road from Namji @-@ ri running west along the Naktong passes the southern tip of this ridge and crosses to the west side of the river at the ferry . A small village called Agok lay at the base of Hill 94 and 300 yards ( 270 m ) from the river . A patrol of tanks and armored vehicles , together with two infantry squads of A Company , 9th Infantry , held a roadblock near the ferry and close to Agok . On the evening of August 31 , A Company moved from its ridge positions overlooking Agok and the river to new positions along the river below the ridge line . 
- That evening Sergeant Ernest R. Kouma led the patrol of two M26 Pershing tanks and two M19 Gun Motor Carriages in Agok . Kouma placed his patrol on the west side of Agok near the Kihang ferry . At 20 : 00 a heavy fog covered the river , and at 22 : 00 mortar shells began falling on the American @-@ held side of the river . By 22 : 15 this strike intensified and North Korean mortar preparation struck A Company 's positions . American mortars and artillery began firing counterbattery . Some of the A Company men reported they heard noises on the opposite side of the river and splashes in the water . 
- At 22 : 30 the fog lifted and Kouma saw that a North Korean pontoon bridge was being laid across the river directly in front of his position . Kouma 's four vehicles attacked this structure , and after about a minute of heavy fire the bridge collapsed , and the pontoon boats used to hold the bridge in place were sunk . At 23 : 00 a small arms fight flared around the left side of A Company north of the tanks . This gunfire had lasted only two or three minutes when the A Company roadblock squads near the tanks received word by field telephone that the company was withdrawing to the original ridge positions and that they should do likewise . 
- Kouma 's patrol was then ambushed by a group of North Koreans dressed in US military uniforms . Kouma was wounded and the other three vehicles had to withdraw , but he held the Agok site until 07 : 30 the next morning with his single tank . In the attack against A Company , the North Koreans hit the 1st Platoon , which was near Agok , but they did not find the 2nd Platoon northward . 
- The NK 9th Division 's infantry crossing of the Naktong and attack on its east side near midnight quickly overran the positions of C Company , north of A Company . There the North Koreans assaulted in force , signaled by green flares and blowing of whistles . The company held its positions only a short time and then attempted to escape . Many of the men moved south , a few of them coming into A Company 's ridge line positions near Agok during the night . Most of C Company moved all the way to the 25th Division positions south of the Naktong . On September 1 that division reported that 110 men of C Company had come into its lines . 
- 
- = = = North Korean crossing = = = 
- 
- Meanwhile , 5 miles ( 8 @.@ 0 km ) north of Agok and A Company 's position , B Company , 9th Infantry , held a similar position on Hill 209 overlooking the Paekchin ferry crossing of the river . This ferry was located at the middle of the Naktong Bulge where the Yongsan road came down to the Naktong and crossed it . The US 2nd Infantry Division had planned a reconnaissance mission to start from there the night of August 31 , the same night that the NK I Corps offensive rolled across the river . 
- Near the end of the month two reconnaissance patrols from the 9th Infantry had crossed to the west side of the Naktong and observed North Korean tank and troop activity 2 miles ( 3 @.@ 2 km ) west of the river . Information obtained later indicated it was in fact the command post of the NK 9th Division . On August 25 , 9th Infantry commander Colonel John G. Hill outlined projected " Operation Manchu , " which was to be a company @-@ sized combat patrol to cross the river , advance to the suspected North Korean command post and communications center , destroy it , capture prisoners , and collect intelligence . 
- The 9th Infantry Regiment had planned Task Force Manchu on orders from the 2nd Division commander Major General Laurence B. Keiser , who in turn had received instructions from Eighth United States Army commander Lieutenant General Walton Walker for aggressive patrolling . Keiser decided the patrol should cross the river at the Paekchin ferry . The 9th Infantry reserve , E Company , reinforced with one section of light machine guns from H Company , was to be the attack force . The 1st Platoon , 2nd Engineer Combat Battalion , was to transport it across the river in assault boats the night of August 31 . Two heavy weapons companies , D and H , were each to furnish one section of heavy machine guns , one section of 81 @-@ mm. mortars , and one section of 75 @-@ mm. recoilless rifles for supporting fires . A platoon of 4 @.@ 2 @-@ inch mortars was also to give support . 
- After dark on August 31 , First Lieutenant Charles I. Caldwell of D Company and First Lieutenant Edward Schmitt of H Company , 9th Infantry , moved their men and weapons to the base of Hill 209 , which was within B Company 's defense sector and overlooked the Paekchin ferry crossing of the Naktong River . The raiding force , E Company , was still in its regimental reserve position about 2 miles ( 3 @.@ 2 km ) west of Yongsan , getting ready with the engineer platoon to move to the crossing site . Colonel Hill went forward in the evening with the 4 @.@ 2 @-@ inch mortar platoon to its position at the base of Hill 209 where the mortarmen prepared to set up their weapons . 
- By 21 : 00 , the closest front line unit was B Company on top of Hill 209 , 1 mile ( 1 @.@ 6 km ) north of the river road which curved around the hill 's southern base . The regimental chaplain , Captain Lewis B. Sheen , had gone forward in the afternoon to B Company to hold services . On top of Hill 209 , Chaplain Sheen and men in B Company after dark heard splashing in the water below them . They soon discovered a long line of North Korean soldiers wading the river . 
- The first North Korean crossing at the Paekchin ferry caught the Heavy Mortar Platoon unprepared in the act of setting up its weapons . It also caught most of the D and H Company men at the base of Hill 209 , .5 miles ( 0 @.@ 80 km ) from the crossing site . The North Koreans killed or captured many of the troops there . Hill was there , but escaped to the rear just before midnight , together with several others , when the division canceled Operation Manchu because of the attacks . The first heavy weapons carrying party was on its way up the hill when the North Korean attack engulfed the men below . It hurried on to the top where the advance group waited and there all hastily dug in on a small perimeter . This group was not attacked during the night . 
- From 21 : 30 until shortly after midnight the NK 9th Division crossed the Naktong at a number of places and climbed the hills quietly toward the 9th Infantry river line positions . Then , when the artillery barrage preparation lifted , the North Korean infantry were in position to launch their assaults . These began in the northern part of the regimental sector and quickly spread southward . At each crossing site the North Koreans would overwhelm local UN defenders before building pontoon bridges for their vehicles and armor . 
- At 02 : 00 , B Company was attacked . A truck stopped at the bottom of the hill , a whistle sounded , then came a shouted order , and North Korean soldiers started climbing the slope . The hills on both sides of B Company were already under attack as was also Hill 311 , a rugged terrain feature 1 @.@ 5 miles ( 2 @.@ 4 km ) from the river and the North Koreans ' principal immediate objective . The North Koreans apparently were not aware of the Task Force Manchu group lower down on the hill and it was not attacked during the night . But higher up on Hill 209 the North Koreans drove B Company from its position , inflicting very heavy casualties on it . Sheen led one group of soldiers back to friendly lines on 4 September . 
- At 03 : 00 , 1 September , the 9th Infantry Regiment ordered its only reserve , E Company to move west along the Yongsan @-@ Naktong River road and take a blocking position at the pass between Cloverleaf Hill and Obong @-@ ni Ridge , 3 miles ( 4 @.@ 8 km ) from the river and 6 miles ( 9 @.@ 7 km ) from Yongsan . This was the critical terrain where so much heavy fighting had taken place in the first battle of the Naktong Bulge . Fighting began at the pass at 02 : 30 when an American medium tank of A Company , 72nd Tank Battalion , knocked out a T @-@ 34 at Tugok , also called Morisil . E Company never reached its blocking position . A strong North Korean force surprised and delivered heavy automatic fire on it at 03 : 30 from positions astride the road east of the pass . The company suffered heavy casualties , including the company commander and Keiser 's aide who had accompanied the force . With the critical parts of Cloverleaf Hill and Obong @-@ ni Ridge , the best defensive terrain between Yongsan and the river , the North Koreans controlled the high ground . The US 2nd Infantry Division now had to base its defense of Yongsan on relatively poor defensive terrain , the low hills at the western edge of the town . 
- 
- = = = US 23rd Infantry attacked = = = 
- 
- North of the 9th Infantry sector of the 2nd Infantry Division front along the Naktong , the US 23rd Infantry Regiment on August 29 had just relieved the 3rd Battalion of the US 38th Infantry Regiment , which in turn had only a few days before relieved the US 21st Infantry Regiment of the US 24th Infantry Division . On August 31 , the 23rd Regiment was in a new sector of which it had only a limited knowledge . It took over a 16 @,@ 000 yards ( 15 @,@ 000 m ) Naktong River front without its 3rd Battalion which had been attached to the US 1st Cavalry Division to the north . Colonel Paul L. Freeman , the regimental commander , deployed the 1st Battalion on the high ground along the river with the three companies abreast . The 1st Battalion , under US Lieutenant Colonel Claire E. Hutchin , Jr . , outposted the hills with platoons and squads . He placed the 2nd Battalion in a reserve position 8 miles ( 13 km ) behind the 1st Battalion and in a position where it commanded the road net in the regimental sector . On August 31 , the 2nd Division moved E Company south to a reserve position in the 9th Infantry sector . 
- Two roads ran through the regimental sector from the Naktong River to Changnyong . The main road bent south along the east bank of the river to Pugong @-@ ni and then turned northeast to Changnyong . A northern secondary road curved around marshland and lakes , the largest of which was Lake U @-@ p 'o , to Changnyong . In effect , the 1st Battalion of the 23rd Regiment guarded these two approach routes to Changnyong . 
- The 42 men of the 2nd Platoon , B Company , 23rd Infantry held outpost positions on seven hills covering a 2 @,@ 600 yards ( 2 @,@ 400 m ) front along the east bank of the Naktong north of Pugong @-@ ni . Across the river in the rice paddies they could see , in the afternoon of August 31 , two large groups of North Korean soldiers . Occasionally artillery fire dispersed them . Just before dark , the platoon saw a column of North Koreans come out of the hills and proceed toward the river . They immediately reported to the battalion command post . The artillery forward observer , who estimated the column at 2 @,@ 000 people , thought they were refugees . Freeman immediately ordered the artillery to fire on the column , reducing its number . However the North Koreans continued their advance . 
- At 21 : 00 the first shells fell in what proved to be a two @-@ hour North Korean artillery and mortar preparation against the American river positions of 2nd Platoon . As the barrage rolled on , North Korean infantry crossed the river and climbed the hills in the darkness under cover of its fire . At 23 : 00 the barrage lifted and the North Koreans attacked 2nd Platoon , forcing it from the hill after a short fight . Similar assaults took place elsewhere along the battalion outpost line . 
- On the regimental left along the main Pugong @-@ ni @-@ Changnyong road North Korean soldiers completely overran C Company by 0300 September 1 . Only seven men of C Company could be accounted for , and three days later , after all the stragglers and those cut off behind North Korean lines had come in , there were only 20 men in the company . As the North Korean attack developed during the night , 1st Battalion succeeded in withdrawing a large part of its force , less C Company , just north of Lake U @-@ p 'o and the hills there covering the northern road into Changnyong , 3 miles ( 4 @.@ 8 km ) east of the river and 5 miles ( 8 @.@ 0 km ) west of the town . B Company lost heavily in this action . 
- When word of the disaster that had overtaken 1st Battalion reached regimental headquarters , Freeman obtained the release of G and F Companies from 2nd Division reserve and sent the former to help 1st Battalion and the latter on the southern road toward Pugong @-@ ni and C Company . Major Lloyd K. Jenson , executive officer of the 2nd Battalion , accompanied F Company down the Pugong @-@ ni road . This force was unable to reach C Company , but Jenson collected stragglers from it and seized high ground astride this main approach to Changnyong near Ponch 'o @-@ ri above Lake Sanorho , and went into a defensive position there . The US 2nd Division released E Company to the regiment and the next day it joined F Company to build up what became the main defensive position of the 23d Regiment in front of Changnyong . North Korean troops during the night passed around the right flank of 1st Battalion 's northern blocking position and reached the road three miles behind it near the division artillery positions . The 23rd Infantry Headquarters and Service Companies and other miscellaneous regimental units finally stopped this penetration near the regimental command post 5 miles ( 8 @.@ 0 km ) northwest of Changnyong . 
- 
- = = = US 2nd Division split = = = 
- 
- Before the morning of 1 September had passed , reports coming in to US 2nd Division headquarters made it clear that North Koreans had penetrated to the north @-@ south Changnyong @-@ Yongsan road and cut the division in two ; the 38th and 23d Infantry Regiments with the bulk of the division artillery in the north were separated from the division headquarters and the 9th Infantry Regiment in the south . Keiser decided that this situation made it advisable to control and direct the divided division as two special forces . Accordingly , he placed the division artillery commander , Brigadier General Loyal M. Haynes , in command of the northern group . Haynes ' command post was 7 miles ( 11 km ) north of Changnyong . Task Force Haynes became operational at 10 : 20 , September 1 . Southward , in the Yongsan area , Keiser placed Brigadier General Joseph S. Bradley , Assistant Division Commander , in charge of the 9th Infantry Regiment , the 2nd Engineer Combat Battalion , most of the 72nd Tank Battalion , and other miscellaneous units of the division . This southern grouping was known as Task Force Bradley . 
- All three regiments of the NK 2nd Division @-@ the 4th , 17th , and 6th , in line from north to south @-@ crossed during the night to the east side of the Naktong River into the 23rd Regiment sector . The NK 2nd Division , concentrated in the Sinban @-@ ni area west of the river , had , in effect , attacked straight east across the river and was trying to seize the two avenues of advance into Changnyong above and below Lake U @-@ p 'o . On August 31 , 1950 , Lake U @-@ p 'o was a large body of water although in most places very shallow . 
- At dawn September 1 , Keiser at 2nd Division headquarters in Muan @-@ ni , 7 miles ( 11 km ) east of Yongsan on the Miryang road , felt his division was in the midst of a crisis . The massive North Korean attack had made deep penetrations everywhere in the division sector except in the north in the zone of the 38th Infantry . The NK 9th Division had effected major crossings of the Naktong at two principal points against the US 9th Infantry ; the NK 2nd Division in the meantime had made three major crossings against the US 23rd Infantry ; and the NK 10th Division had begun crossing more troops in the Hill 409 area near Hyongp 'ung in the US 38th Infantry sector . At 08 : 10 Keiser telephoned Eighth Army headquarters and reported the heaviest and deepest North Korean penetrations were in the 9th Infantry sector . 
- Liaison planes rose from the division strip every hour to observe the North Korean progress and to locate US 2nd Infantry Division front @-@ line units . Communication from division and regimental headquarters to nearly all the forward units was broken . Beginning at 09 : 30 and continuing throughout the rest of the day , the light aviation section of the division artillery located front @-@ line units cut off by the North Koreans , and made fourteen airdrops of ammunition , food , water , and medical supplies . As information slowly built up at division headquarters it became apparent that the North Koreans had punched a hole 6 miles ( 9 @.@ 7 km ) wide and 8 miles ( 13 km ) deep in the middle of the division line and made less severe penetrations elsewhere . The front @-@ line battalions of the US 9th and 23rd Regiments were in various states of disorganization and some companies had virtually disappeared . Keiser hoped he could organize a defense along the Changnyong @-@ Yongsan road east of the Naktong River , and prevent North Korean access to the passes eastward leading to Miryang and Ch 'ongdo . 
- 
- = = = Reinforcements = = = 
- 
- At 09 : 00 Walker requested the US Air Force to make a maximum effort along the Naktong River from Toksong @-@ dong , just above the US 2nd Division boundary , southward and to a depth of 15 miles ( 24 km ) west of the river . He wanted the Air Force to isolate the battlefield and prevent further North Korean reinforcements and supplies from moving across the river in support of the North Korean spearhead units . The Far East Command requested the US Navy to join in the air effort , and the US Seventh Fleet turned back from its strikes in the Inch 'on @-@ Seoul area and sped southward at full steam toward the southern battle front . Walker came to the US 2nd Division front at 12 : 00 and ordered the division to hold at all costs . He had already ordered ground reinforcements to the Yongsan area . 
- During the morning of 1 September , Walker weighed the news coming in from his southern front , wavering in a decision as to which part of the front most needed his Pusan Perimeter reserves . Since midnight the NK I Corps had broken his Pusan Perimeter in two places @-@ the NK 2nd and 9th Divisions in the US 2nd Division sector , and the NK 7th Division and NK 6th Division in the US 25th Division sector , below the junction of the Nam and Naktong Rivers . In the US 2nd Division sector North Korean troops were at the edge of Yongsan , the gateway to the corridor leading 12 miles ( 19 km ) eastward to Miryang and the main Pusan @-@ Mukden railroad and highway . 
- Eighth Army had in reserve three understrength infantry regiments and the 2 @-@ battalion British 27th Infantry Brigade which was not yet completely equipped and ready to be placed in line : The 1st Provisional Marine Brigade at Changwon , 6 miles ( 9 @.@ 7 km ) northeast of Masan , preparing for movement to the port of Pusan ; the US 27th Infantry Regiment of the 25th Division which had arrived at Masan only the night before at 20 : 30 to relieve the 5th Regimental Combat Team , which was then to join the 24th Division in the Taegu area ; and the US 19th Infantry Regiment of the US 24th Infantry Division , then with that division 's headquarters at Kyongsan southeast of Taegu . Walker alerted both the 24th Division headquarters , together with its 19th Regiment , and the 1st Provisional Marine Brigade to move at a moment 's notice ; the 24th Division either to the 2nd or 25th Division fronts , and the marines to an unannounced destination . 
- As the morning passed , General Walker decided that the situation was most critical in the Naktong Bulge area of the US 2nd Division sector . There the North Koreans threatened Miryang and with it the entire Eighth Army position . At 11 : 00 Walker ordered US Marine Corps Brigadier General Edward A. Craig , commanding the 1st Provisional Marine Brigade , to prepare the marines to move at once . The marines made ready to depart for the Naktong Bulge at 13 : 30 . 
- 
- = = = North Korean advance = = = 
- 
- The situation on the front was chaotic during the day September 1 . The North Koreans at one place had crossed at the Kihang ferry , captured Agok , and scattered A Company , 9th Infantry at its positions from Agok northward . A Company withdrew to positions on the ridge line back of the river . From there at daylight the men could see North Korean soldiers on many of the ridges surrounding them , most of them moving east . After several hours , 2nd Platoon of A Company sent a patrol down the hill to Agok to obtain supplies abandoned there during the night , returning later with much needed water , rations , and ammunition . 
- Later in the morning North Korean barges crossed the Naktong below A Company . The company sent a squad with a light machine gun to the southern tip of the ridge overlooking Agok to take these troops under fire . When the squad reached the tip of the ridge they saw that a North Korean force occupied houses at its base . The company hit these houses with artillery . The North Koreans broke from the houses , running for the river . At this the light machine gun at the tip of the ridge took them under fire , as did another across the Naktong to the south in the US 25th Infantry Division sector . Proximity fuze artillery fire decimated this group . Combined fire from all weapons inflicted an estimated 300 casualties on this North Korean force . In the afternoon , US aircraft dropped food and ammunition to the company ; only part of it was recovered . The 1st Battalion ordered A Company to withdraw that night . 
- During the withdrawal , however , A Company ran into a sizable North Korean force and scattered in the ensuing fight . Most of the company , including its commander , were killed at close range . In this desperate action , Private First Class Luther H. Story , a weapons squad leader , fought so tenaciously that he was awarded the Medal of Honor . Badly wounded , Story refused to be a burden to those who might escape , and when last seen was still engaging North Koreans at close range . Of those in the company , approximately ten men escaped to friendly lines . The next morning , under heavy fog , the group made its way by compass toward Yongsan . From a hill at 12 : 00 , after the fog had lifted , the men looked down on the Battle of Yongsan which was then in progress . That afternoon 20 survivors of the company merged into the lines of the 72nd Tank Battalion near Yongsan . Stragglers from this position continued to stream in the next few days as well . 
- 
- = = = The end of Task Force Manchu = = = 
- 
- In the meantime , Task Force Manchu was still holding its position along the Naktong River , about 5 miles ( 8 @.@ 0 km ) north of where A Company had been destroyed on the southern end of the line . The perimeter position taken by the men of D and H Companies , 9th Infantry , who had started up the hill before the North Koreans struck , was on a southern knob of Hill 209 , 0 @.@ 5 miles ( 0 @.@ 80 km ) south of B Company 's higher position . In addition to the D and H Company men , there were a few from the Heavy Mortar Platoon and one or two from B Company . Altogether , 60 to 70 men were in the group . The group had an SCR @-@ 300 radio , a heavy machine gun , two light machine guns , an M1918 Browning Automatic Rifle , about 20 M1 Garand rifles , and about 40 carbines or pistols . Schmitt assumed command of the group . 
- During the night Schmitt established radio communication with the 1st Battalion , 9th Infantry . When daylight came Schmitt and his group saw that they were surrounded by North Koreans . One force occupied the higher knob half a mile above them , formerly held by B Company . Below them , North Koreans continued crossing the river and moving supplies forward to their combat units , some of them already several miles eastward . The North Koreans quickly discovered the Task Force Manchu group . They first attacked it at 14 : 00 that afternoon , and were repulsed . That night an estimated company attacked three times , pressing the fight to close quarters , but failed each time to penetrate the tight US perimeter . Daylight of the second day disclosed many North Korean dead on the steep slopes outside the perimeter . 
- In the afternoon of September 2 Schmitt radioed 1st Battalion for an airdrop of supplies . A US plane attempted the drop , but the perimeter was so small and the slopes so steep that virtually all the supplies went into North Korean hands . The men in the perimeter did , however , recover from a drop made later at 19 : 00 some supplies and ammunition . Private First Class Joseph R. Ouellette , of H Company , left the perimeter to gather weapons , ammunition , and grenades from the North Korean dead . On several occasions he was attacked , and on one such occasion a North Korean soldier suddenly attacked Ouellette , who killed the North Korean in hand @-@ to @-@ hand combat . 
- That same afternoon , the North Koreans sent an American prisoner up the hill to Schmitt with the message , " You have one hour to surrender or be blown to pieces . " Failing in frontal infantry attack to reduce the little defending force , the North Koreans now meant to take it under mortar fire . Only 45 minutes later North Korean antitank fire came in on the knob and two machine guns from positions northward and higher on the slope of Hill 209 swept the perimeter . Soon , mortars emplaced on a neighboring high finger ridge eastward registered on Schmitt 's perimeter and continued firing until dark . The machine gun fire forced every man to stay in his foxhole . The lifting of the mortar fire after dark was the signal for renewed North Korean infantry attacks , all of which were repulsed . But the number of killed and wounded within the perimeter was growing , and supplies were diminishing . There were no medical supplies except those carried by one aid man . 
- The third day , September 3 , the situation worsened . The weather was hot and ammunition , food and supplies were nearly completely exhausted . Since the previous afternoon , North Korean mortar barrages had alternated with infantry assaults against the perimeter . Survivors later estimated there were about twenty separate infantry attacks repulsed . Two North Korean machine guns still swept the perimeter whenever anyone showed himself . Dead and dying US troops were in almost every foxhole . Mortar fragments destroyed the radio and this ended all communication with other US units . Artillery fire and air strikes requested by Schmitt never came . Some North Koreans worked their way close to the perimeter and threw grenades into it . Six times Ouellette leaped from his foxhole to escape grenades thrown into it . In this close action Ouellette was killed . Most of the foxholes of the perimeter received one or more direct mortar hits in the course of the continuing mortar fire . One of these killed Schmitt on September 3 . The command passed now to First Lieutenant Raymond J. McDoniel of D Company , senior surviving officer . 
- At daylight on the morning of 4 September only two officers and approximately half the men who had assembled on the hill were alive . As the day passed , with ammunition down to about one clip per man and only a few grenades left and no help in sight , McDoniel decided to abandon the position that night . When it got dark the survivors would split into small groups and try to get back to friendly lines . That evening after dark the North Koreans launched another weak attack against the position . At 22 : 00 , McDoniel and Caldwell and 27 enlisted men slipped off the hill in groups of four . Master Sergeant Travis E. Watkins , still alive in his paralyzed condition , refused efforts of evacuation , saying that he did not want to be a burden to those who had a chance to get away . He asked only that his carbine be loaded and placed on his chest with the muzzle under his chin . Like Ouellette , he would also win the Medal of Honor for his actions . Of the 29 men who came off the hill the night of September 4 , 22 escaped to friendly lines , many of them following the Naktong downstream , hiding by day and traveling by night , until they reached the lines of the US 25th Infantry Division . 
- Members of Task Force Manchu who escaped from Hill 209 brought back considerable intelligence information of North Korean activity in the vicinity of the Paekchin ferry crossing site . At the ferry site the North Koreans had put in an underwater bridge . A short distance downstream , each night they placed a pontoon bridge across the river and took it up before dawn the next morning . Carrying parties of 50 civilians guarded by four North Korean soldiers crossed the river continuously at night , an estimated total of 800 @-@ 1 @,@ 000 carriers being used at this crossing site . 
- 
- = = = Changnyong = = = 
- 
- North of the US 9th Infantry and the battles in the Naktong Bulge and around Yongsan , the US 23d Infantry Regiment after daylight of September 1 was in a very precarious position . Its 1st Battalion had been driven from the river positions and isolated 3 miles ( 4 @.@ 8 km ) westward . Approximately 400 North Koreans now overran the regimental command post , compelling Freeman to withdraw it about 600 yards ( 550 m ) . There , 5 miles ( 8 @.@ 0 km ) northwest of Changnyong , the US 23rd Infantry Headquarters and Headquarters Company , miscellaneous regimental units , and regimental staff officers checked the North Koreans in a 3 @-@ hour fight . 
- The North Koreans advanced to Changnyong itself during the afternoon of September 2 , and ROK National Police withdrew from the town . North Koreans were in Changnyong that evening . With his communications broken southward to the 2nd Infantry Division headquarters and the 9th Infantry , Haynes during the day decided to send a tank patrol down the Yongsan road in an effort to re @-@ establish communication . C Company , 72nd Tank Battalion , led its tanks southward . They had to fight their way down the road through several roadblocks . Of the three tanks that started , only the lead tank got through to Yongsan . There , it delivered an overlay of Task Force Haynes ' positions to Bradley . 
- Still farther northward in the zone of the US 38th Infantry the North Koreans were also active . After the North Korean breakthrough during the night of August 31 , Keiser had ordered the 2nd Battalion , 38th Infantry , to move south and help the 23rd Infantry establish a defensive position west of Changnyong . In attempting to do this , the battalion found North Korean troops already on the ridges along the road . They had penetrated to Hill 284 overlooking the 38th Infantry command post . This hill and Hill 209 dominated the rear areas of the regiment . At 06 : 00 September 3 , 300 North Koreans launched an attack from Hill 284 against the 38th Regiment command post . The regimental commander organized a defensive perimeter and requested a bombing strike which was denied him because the enemy target and his defense perimeter were too close to each other . But the Air Force did deliver rocket and strafing strikes . 
- This fight continued until September 5 . On that day F Company captured Hill 284 killing 150 North Koreans . From the crest Schauer and his men watched as many more North Koreans ran into a village below them . Directed artillery fire destroyed the village . Among the abandoned North Korean materiel on the hill , Schauer 's men found twenty @-@ five American BARs and submachine guns , a large American radio , thirty boxes of unopened American fragmentation and concussion grenades , and some American rations . 
- 
- = = = 1 @-@ 23rd Infantry isolated = = = 
- 
- Meanwhile , during these actions in its rear , the 1st Battalion , 23rd Infantry , was cut off 3 miles ( 4 @.@ 8 km ) west of the nearest friendly units . On September 1 the regiment ordered it to withdraw to the Changnyong area . At 14 : 00 a tank @-@ infantry patrol was sent down the road , but it reported that an estimated North Korean battalion held the mountain pass just eastward of the battalion 's defense perimeter . Upon receiving this report the battalion commander requested permission by radio to remain in his present position and try to obstruct the movement of North Korean reinforcements and supplies . That evening Freeman approved this request , and 1st Battalion spent three days in the isolated positions . During this time C @-@ 47 Skytrain planes supplied the battalion by airdrops . 
- On the morning of September 1 , 3rd Battalion , 38th Infantry moved in an attack westward from the 23rd Regiment command post near Mosan @-@ ni to open the road to the 1st Battalion . On the second day of the fighting at the pass , the relief force broke through the roadblock with the help of air strikes and artillery and tank fire . The advanced elements of the battalion joined 1st Battalion at 17 : 00 September 2 . That evening , North Koreans strongly attacked the 3rd Battalion , 38th Infantry , on Hill 209 north of the road and opposite 1st Battalion , driving one company from its position . 
- On September 4 , Haynes changed the boundary between the 38th and 23rd Infantry Regiments , giving the northern part of the 23rd 's sector to the 38th Infantry , thus releasing 1st Battalion for movement southward to help the 2nd Battalion defend the southern approach to Changnyong . The 1st Battalion , 23rd Infantry , about 1 @,@ 100 men strong when the attack began , was now down to a strength of approximately 600 men . The 23rd Infantry now made plans to concentrate all its troops on the position held by its 2nd Battalion on the Pugong @-@ ni @-@ Changnyong road . The 1st Battalion moved there and took a place on the left flank of the 2nd Battalion . At the same time the regimental command post moved to the rear of this position . In this regimental perimeter , the 23rd Infantry fought a series of hard battles . Simultaneously it had to send combat patrols to its rear to clear infiltrating North Koreans from Changnyong and from its supply road . 
- 
- = = = Battle of Yongsan = = = 
- 
- On the morning of September 1 the 1st and 2nd Regiments of the NK 9th Division , in their first offensive of the war , stood only a few miles short of Yongsan after a successful river crossing and penetration of the American line . The 3rd Regiment had been left at Inch 'on , but division commander Major General Pak Kyo Sam felt the chances of capturing Yongsan were strong . 
- On the morning of September 1 , with only the shattered remnants of E Company at hand , the US 9th Infantry Regiment , US 2nd Infantry Division had virtually no troops to defend Yongsan . Keiser in this emergency attached the 2nd Engineer Combat Battalion to the regiment . The US 72nd Tank Battalion and the 2nd Division Reconnaissance Company also were assigned positions close to Yongsan . The regimental commander planned to place the engineers on the chain of low hills that arched around Yongsan on the northwest . 
- A Company , 2nd Engineer Combat Battalion , moved to the south side of the Yongsan @-@ Naktong River road ; D Company of the 2nd Engineer Battalion was on the north side of the road . Approximately 2 miles ( 3 @.@ 2 km ) west of Yongsan an estimated 300 North Korean troops engaged A Company in a fire fight . M19 Gun Motor Carriages of the 82nd AAA Battalion supported the engineers in this action , which lasted several hours . Meanwhile , with the approval of General Bradley , D Company moved to the hill immediately south of and overlooking Yongsan . A platoon of infantry went into position behind it . A Company was now ordered to fall back to the southeast edge of Yongsan on the left flank of D Company . There , A Company went into position along the road ; on its left was C Company of the Engineer battalion , and beyond C Company was the 2nd Division Reconnaissance Company . The hill occupied by D Company was in reality the western tip of a large mountain mass that lay southeast of the town . The road to Miryang came south out of Yongsan , bent around the western tip of this mountain , and then ran eastward along its southern base . In its position , D Company not only commanded the town but also its exit , the road to Miryang . 
- North Koreans had also approached Yongsan from the south . The US 2nd Division Reconnaissance Company and tanks of the 72nd Tank Battalion opposed them in a sharp fight . In this action , Sergeant First Class Charles W. Turner of the Reconnaissance Company particularly distinguished himself . He mounted a tank , operated its exposed turret machine gun , and directed tank fire which reportedly destroyed seven North Korean machine guns . Turner and this tank came under heavy North Korean fire which shot away the tank 's periscope and antennae and scored more than 50 hits on it . Turner , although wounded , remained on the tank until he was killed . That night North Korean soldiers crossed the low ground around Yongsan and entered the town from the south . 
- At 09 : 35 September 2 , while the North Koreans were attempting to destroy the engineer troops at the southern edge of Yongsan and clear the road to Miryang , Walker spoke by telephone with Major General Doyle O. Hickey , Deputy Chief of Staff , Far East Command in Tokyo . He described the situation around the Perimeter and said the most serious threat was along the boundary between the US 2nd and US 25th Infantry Divisions . He described the location of his reserve forces and his plans for using them . He said he had started the 1st Provisional Marine Brigade toward Yongsan but had not yet released them for commitment there and he wanted to be sure that General of the Army Douglas MacArthur approved his use of them , since he knew that this would interfere with other plans of the Far East Command . Walker said he did not think he could restore the 2nd Division lines without using them . Hickey replied that MacArthur had the day before approved the use of the US Marines if and when Walker considered it necessary . A few hours after this conversation Walker , at 13 : 15 , attached the 1st Provisional Marine Brigade to the US 2nd Division and ordered a co @-@ ordinated attack by all available elements of the division and the marines , with the mission of destroying the North Koreans east of the Naktong River in the 2nd Division sector and of restoring the river line . The marines were to be released from 2nd Division control as soon as this mission was accomplished . 
- A decision was reached that the marines would attack west at 08 : 00 on September 3 astride the Yongsan @-@ Naktong River road ; the 9th Infantry , B Company of the 72nd Tank Battalion , and D Battery of the 82d AAA Battalion would attack northwest above the marines and attempt to re @-@ establish contact with the US 23rd Infantry ; the 2nd Engineer Combat Battalion , remnants of the 1st Battalion , 9th Infantry , and elements of the 72nd Tank Battalion would attack on the left flank , or south , of the marines to reestablish contact with the 25th Division . Eighth Army now ordered the US 24th Infantry Division headquarters and the US 19th Infantry Regiment to move to the Susan @-@ ni area , 8 miles ( 13 km ) south of Miryang and 15 miles ( 24 km ) east of the confluence of the Nam River and the Naktong River . There it was to prepare to enter the battle in either the 2nd or 25th Division zone . 
- The American counteroffensive of September 3 – 5 west of Yongsan , according to prisoner statements , resulted in one of the bloodiest debacles of the war for a North Korean division . Even though remnants of the NK 9th Division , supported by the low strength NK 4th Division , still held Obong @-@ ni Ridge , Cloverleaf Hill , and the intervening ground back to the Naktong on September 6 , the division 's offensive strength had been spent at the end of the American counterattack . The NK 9th and 4th divisions were not able to resume the offensive . 
- 
- = = = NK 2nd Division destroyed = = = 
- 
- The NK 2nd Division made a new effort against the 23rd Infantry 's perimeter in the predawn hours of September 8 , in an attempt to break through eastward . This attack , launched at 02 : 30 and heavily supported with artillery , penetrated F Company . It was apparent that unless F Company 's position could be restored the entire regimental front would collapse . When all its officers became casualties , First Lieutenant Ralph R. Robinson , adjutant of the 2nd Battalion , assumed command of the company . With North Koreans rapidly infiltrating his company 's position and gaining its rear , Robinson in the darkness made his way through them 500 yards ( 460 m ) to A Company 's position . There he obtained that company 's reserve platoon and brought it back to F Company . He accomplished the dangerous and difficult task of maneuvering it into the gap in F Company 's lines in darkness and heavy rain . 
- The attack tapered off with the coming of daylight , but that night it resumed . The North Koreans struck repeatedly at the defense line . This time they continued the fighting into the daylight hours of 9 September . The Air Force then concentrated strong air support over the regimental perimeter to aid the ground troops . Casualties came to the aid stations from the infantry companies in an almost steady stream during the morning . All available men from Headquarters Company and special units were formed into squads and put into the fight at the most critical points . At one time , the regimental reserve was down to six men . When the attack finally ceased shortly after 12 : 00 the 23rd Regiment had an estimated combat efficiency of only 38 percent . 
- This heavy night and day battle cost the NK 2nd Division most of its remaining offensive strength . The medical officer of the NK 17th Regiment , 2nd Division , captured a few days later , said that the division evacuated about 300 men nightly to a hospital in Pugong @-@ ni , and that in the first two weeks of September the 2nd Division lost 1 @,@ 300 killed and 2 @,@ 500 wounded in the fighting west of Changnyong . Even though its offensive strength was largely spent by September 9 , the division continued to harass rear areas around Changnyong with infiltrating groups as large as companies . Patrols daily had to open the main supply road and clear the town . 
- North Korean and US troops remained locked in combat along the Naktong River for several more days . The North Koreans ' offensive capability was largely destroyed , and the US troops resolved to hold their lines barring further attack . 
- 
- = = = North Korean withdrawal = = = 
- 
- The UN counterattack at Inchon collapsed the North Korean line and cut off all their main supply and reinforcement routes . On September 19 the UN discovered the North Koreans had abandoned much of the Pusan Perimeter during the night , and the UN units began advancing out of their defensive positions and occupying them . Most of the North Korean units began conducting delaying actions attempting to get as much of their army as possible into North Korea . The North Koreans withdrew from the Masan area first , the night of September 18 – 19 . After the forces there , the remainder of the North Korean armies withdrew rapidly to the North . The US units rapidly pursued them north , passing over the Naktong River positions , which were no longer of strategic importance . 
- 
- = = Aftermath = = 
- 
- The North Korean 2nd and 9th Divisions were almost completely destroyed in the battles . The 9th Division had numbered 9 @,@ 350 men at the beginning of the offensive on September 1 . The 2nd Division numbered 6 @,@ 000 . Only a few hundred from each division returned to North Korea after the fight . The majority of the North Korean troops had been killed , captured or deserted . All of NK II Corps was in a similar state , and the North Korean army , exhausted at Pusan Perimeter and cut off after Inchon , was on the brink of defeat . 
- By this time , the US 2nd Infantry Division suffered 1 @,@ 120 killed , 2 @,@ 563 wounded , 67 captured and 69 missing during its time at Pusan Perimeter . This included about 180 casualties it suffered during the First Battle of Naktong Bulge the previous month . American forces were continually repulsed but able to prevent the North Koreans from breaking the Pusan Perimeter . The division had numbered 17 @,@ 498 on September 1 , but was in excellent position to attack despite its casualties . The 1st Provisional Marine Brigade suffered 185 killed and around 500 wounded during the Battle of Pusan Perimeter , most of which probably occurred at Yongsan . 
- Of all the North Korean attacks along the Pusan Perimeter , the Second Battle of Naktong Bulge is seen by historians as the most serious threat . It was the battle in which the North Koreans made the most substantial gains , splitting the US 2nd Infantry Division in half and briefly capturing Yongsan , where they were very close to breaching through to the US forces ' supply lines and threatening other divisions ' rear areas . However , once again the fatal weakness of the North Korean Army had cost it victory after an impressive initial success — its communications and supply were not capable of exploiting a breakthrough and of supporting a continuing attack in the face of massive air , armor , and artillery fire that could be concentrated against its troops at critical points . By September 8 , the North Korean attacks in the area had been repulsed . 
- 
- 
- = Hed PE = 
- 
- Hed PE , also known as ( hed ) Planet Earth and stylized as ( həd ) p.e. , is an American rock band from Huntington Beach , California . Formed in 1994 , the band performs a style of music which it refers to as " G @-@ punk " , a fusion of punk rock and gangsta rap . 
- After releasing three albums on Jive Records , Hed PE left the label to record independently , eventually signing with Suburban Noize Records in 2006 . Since 2006 , the band has become known for its involvement in the 9 / 11 Truth movement , referencing it in many of their song lyrics and concerts , as well as the concept of the album New World Orphans . To date , they have released nine studio albums , one live album and two compilation albums . 
- 
- = = History = = 
- 
- 
- = = = Formation and major @-@ label debut ( 1994 – 1999 ) = = = 
- 
- The band was formed by vocalist Jared Gomes , formerly of The Clue , also known as " M.C.U.D. " ( MC Underdog ) , and guitarist Wes Geer , who became friends amidst the Orange County hardcore punk scene . Gomes and Geer recruited guitarist Chizad , bassist Mawk , drummer B.C. Vaught and DJ Product © 1969 . They named the group " Hed " , which stands for " higher education " . The band built a following with their energetic performances at local venues , and released the self @-@ financed extended play , Church of Realities . Legal issues forced Hed to change their name , adding " PE " , which stood for " Planet Earth " . 
- Hed PE signed with Jive Records , releasing their self @-@ titled debut album in 1997 . In his review of the album , Allmusic 's Steve Huey wrote " There are some slow and / or unfocused moments [ ... ] but overall , its aggression will probably play well with late- ' 90s metal and punk fans . " Due to the label 's contractual terms and the disappointing sales of the album , the band found themselves unable to repay the cash advances given to them by Jive . Gomes is quoted as saying " We had these romantic visions of the music industry , and we thought it would be cool to be a punk band on a rap label . So we fulfilled that dream , but it was also probably the worst thing that could have happened . [ ... ] We 've had offers from Sony and others that we can 't take because we owe Jive so much money . " 
- 
- = = = Broke and Blackout ( 2000 – 2004 ) = = = 
- 
- On June 6 , 2000 , Hed PE appeared on the tribute album Nativity in Black II , covering Black Sabbath 's " Sabbra Cadabra " . Hed PE released their second studio album , Broke on August 22 , 2000 . It peaked at No. 63 on the Billboard 200 , while its first single , " Bartender " , peaked at No. 23 on the Billboard Mainstream Rock Tracks chart and at No. 27 on the Modern Rock Tracks chart . Allmusic 's Jason D. Taylor wrote : " Broke may have not found as much success in the competitive mainstream market as some would have liked , and even despite its distinct departure from the group 's debut , it is an album that shows more vision than other rap @-@ tinged rock albums to come out in 2000 . " The most negative response to the album came from critics who viewed its lyrics as misogynistic . 
- On October 27 , 2000 , Gomes was arrested for possession of marijuana while the band was performing in Waterbury , Connecticut . He was released on a US $ 1 @,@ 500 bond . In 2001 , Hed PE performed on the Ozzfest tour alongside bands such as Korn , Static @-@ X , and System of a Down . A music video for " Killing Time " , the second single from Broke , was produced in promotion of the film 3000 Miles to Graceland , which featured the song on its soundtrack . 
- Hed PE released their third studio album , Blackout , on March 18 , 2003 . It peaked at No. 33 on the Billboard 200 , while its title track peaked at No. 21 on the Mainstream Rock Tracks chart and at No. 32 on the Modern Rock Tracks chart . Allmusic 's Johnny Loftus wrote that " While it expands on melodic elements that had previously played a supporting role in the band 's sound , Blackout also delivers truckloads of crushing guitar and pounding rhythm . And whether or not it is the presence of a top @-@ line producer , ( hed ) pe have figured out a way to imbue their aggressive mix of heavy rock and hip @-@ hop with some serious hooks . " Guitarist Jaxon joined the band in early 2004 . He is the fourth person to fill this position . 
- 
- = = = Only in Amerika ( 2004 ) = = = 
- 
- Hed PE left Jive Records , releasing their fourth studio album , Only in Amerika , on Koch Records on October 19 , 2004 . It peaked at No. 20 on the Top Independent Albums chart and at No. 186 on the Billboard 200 . In his review of the album , Johnny Loftus wrote " It wants to be a confrontational megaphone in the ear of conservatives , but Jahred 's torrential rhetoric is too messy and blatantly offensive to incite anything but superficial anger , and the music -- though occasionally explosive -- takes a backseat to the ranting . " 
- 
- = = = Suburban Noize Records and New Album Evolution ( 2006 – 2015 ) = = = 
- 
- In 2006 , Hed PE signed with Suburban Noize Records , recording their fifth studio album , Back 2 Base X. The album was intended as a return to the basics of rock music , and did not rely as heavily on studio enhancement as previous releases . The album was released on June 6 , 2006 , the same day as The Best of ( həd ) Planet Earth , a compilation album produced by Jive Records without the band 's authorization or consent . Back 2 Base X peaked at No. 12 on the Independent Albums chart , and at No. 154 on the Billboard 200 . Allmusic 's Rob Theakston wrote that " Back 2 Base X suffers from the same problems as Amerika : it tries to be conceptual in thought à la Tool and vicious in its political commentary à la Fugazi or System of a Down , but somehow falls short by sounding like an angry stoner on a soapbox . It won 't win any new fans , but existing fans of ( hed ) pe 's work won 't be turning their backs away from the band in anger anytime soon , either . " 
- On June 26 , 2007 , the band released their sixth studio album , Insomnia . It peaked at No. 16 on the Independent Albums chart , and at No. 138 on the Billboard 200 . The album 's lead single , " Suffa " , became one of the most requested tracks at Sirius Satellite Radio 's Hard Attack , while the song 's music video was voted one of the Top 10 of 2007 on MTV 's Headbangers Ball . Hed PE released their first live album , The D.I.Y. Guys , in 2008 . On January 13 , 2009 , they released their seventh studio album , New World Orphans . It was released in three different versions ; each contains a different set of bonus tracks . In 2009 , drummer Trauma joined the band . He is the sixth person to fill this position . The band 's eighth studio album , Truth Rising , was released on October 26 , 2010 to mixed reviews . Hed PE played the " Local Heroes Tour " in the fall of 2012 and played with Flipsyde in San Jose on Sunday , October 7 , 2012 . In an interview , frontman Jared Gomes stated that their album for 2013 titled Ascension would be released within the first half of 2014 . Towards the end of 2013 , DJ Product mysteriously left the band with no explanation and no comment from the other members . On January 1 , 2014 , frontman Jahred Gomes stated on the band 's official Facebook page that the upcoming ( hed ) PE album would be named " Evolution " and released within the year . 
- On May 13 , 2014 , on the band 's official Facebook page , they released the official announcement of when the band 's new album Evolution will hit stores . The album is set for release July 22 , 2014 . They also released a teaser of the tone of the new album on their Facebook page and soon after , the track " One More Body " . 
- In 2015 , it was confirmed that 12 @-@ year guitarist Jaxon Benge and original bassist Mark Young had left the band . They were replaced by guitarist Greg " Gregzilla " Harrison and bassist Kurt Blankenship , leaving vocalist and founding member Jared Gomes as the group 's only remaining original talent . 
- 
- = = Style = = 
- 
- 
- = = = Music and lyrics = = = 
- 
- Hed PE performs a style of music which they have referred to as " G @-@ punk " , a phrase inspired by the term " G @-@ funk " , itself a reference to the P @-@ Funk collective . Hed PE 's music is a fusion of styles ranging from hip hop , reggae , and ska to hard rock , punk , and heavy metal . Other elements that have been incorporated into this style include blues , funk , jazz and industrial . Jared Gomes ' vocal style ranges from melodic singing to rapping , screaming , and death growls . The band 's lyrics draw from a number of subjects , including the existence of extraterrestrial life , criticism of organized religion , the 9 / 11 Truth movement , cannabis use and sexual intercourse . 
- Gomes , in addition to the 9 / 11 Truth movement , has expressed support for social liberal politicians such as Nancy Pelosi and president Barack Obama . Previously however , Gomes ' 2004 lyrics for Only in Amerika expressed support for nationalism , and called for retaliation against Al Qaeda for the 9 / 11 terrorist attacks . 
- 
- = = = Influences = = = 
- 
- The band 's influences include HEM , Beastie Boys , Black Sabbath , Bob Marley , Led Zeppelin , Nine Inch Nails , Snoop Dogg , Cypress Hill , Notorious B.I.G. and Rage Against the Machine . Hed PE 's second album , Broke , incorporated classic rock and world music influences , while Back 2 Base X was influenced by classic punk bands such as the Sex Pistols and the Clash , Insomnia was influenced by thrash metal bands such as Slayer , and New World Orphans was influenced by Suicidal Tendencies and Minor Threat . Guitarist Jaxon has been credited for encouraging a heavier , hardcore punk @-@ influenced musical style . 
- 
- = = Band members = = 
- 
- Jared ( Paulo Sergio Gomes ) — lead vocals ( 1994 – current ) 
- Major Trauma ( Jeremiah Stratton ) — drums ( 2008 – current ) 
- Gregzilla ( Greg Harrison ) — guitar ( 2015 – current ) 
- Kid Bass ( Kurt Blankenship ) — bass ( 2015 – current ) 
- 
- = = = Former members = = = 
- 
- Ken Sachs ( The Finger ) — keyboard ( 1994 – 1996 ) 
- Chad Benekos ( Chizad ) — guitar ( 1994 – 2002 ) 
- Wesley Geer ( Wesstyle , Wes Geer ) — guitar ( 1994 – 2003 ) 
- Ben C. Vaught ( B.C. ) — drums ( 1994 – 2003 ) 
- Doug Boyce ( DJ Product © 1969 ) — turntables , samples ( 1994 – 2013 ) 
- Mark Young ( Mawk ) — bass ( 1994 – 2015 ) 
- Sonny Mayo — guitar ( 2002 – 2003 ) 
- Jackson Benge ( Jaxon ) — guitar ( 2004 – 2015 ) 
- Christopher Hendrich — drums ( 2004 ) 
- Mark " Moke " Bistany — drums ( 2004 – 2006 ) 
- Devin Lebsack — drums ( 2006 – 2007 ) 
- Anthony " Tiny Bubz " Biuso — drums ( 2007 – 2008 ) 
- 
- = = = Timeline = = = 
- 
- 
- = = Discography = = 
- 
- Studio albums 
- Church of Realities ( 1995 ) 
- Hed PE ( 1997 ) 
- Broke ( 2000 ) 
- Blackout ( 2003 ) 
- Only in Amerika ( 2004 ) 
- Back 2 Base X ( 2006 ) 
- Insomnia ( 2007 ) 
- New World Orphans ( 2009 ) 
- Truth Rising ( 2010 ) 
- Evolution ( 2014 ) 
- Forever ! ( 2016 ) 
- 
- 
- = Ironclad warship = 
- 
- An ironclad is a steam @-@ propelled warship protected by iron or steel armor plates used in the early part of the second half of the 19th century . The ironclad was developed as a result of the vulnerability of wooden warships to explosive or incendiary shells . The first ironclad battleship , Gloire , was launched by the French Navy in November 1859 . The British Admiralty had been considering armored warships since 1856 and prepared a draft design for an armored corvette in 1857 ; in early 1859 the Royal Navy started building two iron @-@ hulled armored frigates , and by 1861 had made the decision to move to an all @-@ armored battle fleet . After the first clashes of ironclads ( both with wooden ships and with one another ) took place in 1862 during the American Civil War , it became clear that the ironclad had replaced the unarmored ship of the line as the most powerful warship afloat . This type of ship would come to be very successful in the American Civil War . 
- Ironclads were designed for several roles , including as high seas battleships , coastal defense ships , and long @-@ range cruisers . The rapid evolution of warship design in the late 19th century transformed the ironclad from a wooden @-@ hulled vessel that carried sails to supplement its steam engines into the steel @-@ built , turreted battleships and cruisers familiar in the 20th century . This change was pushed forward by the development of heavier naval guns ( the ironclads of the 1880s carried some of the heaviest guns ever mounted at sea ) , more sophisticated steam engines , and advances in metallurgy which made steel shipbuilding possible . 
- The quick pace of change meant that many ships were obsolete as soon as they were finished , and that naval tactics were in a state of flux . Many ironclads were built to make use of the ram or the torpedo , which a number of naval designers considered the important weapons of naval combat . There is no clear end to the ironclad period , but towards the end of the 1890s the term ironclad dropped out of use . New ships were increasingly constructed to a standard pattern and designated battleships or armored cruisers . 
- 
- = = The ironclad = = 
- 
- The ironclad became technically feasible and tactically necessary because of developments in shipbuilding in the first half of the 19th century . According to naval historian J. Richard Hill : " The ( ironclad ) had three chief characteristics : a metal @-@ skinned hull , steam propulsion and a main armament of guns capable of firing explosive shells . It is only when all three characteristics are present that a fighting ship can properly be called an ironclad . " Each of these developments was introduced separately in the decade before the first ironclads . 
- 
- = = = Steam propulsion = = = 
- 
- In the 18th and early 19th centuries fleets had relied on two types of major warship , the ship of the line and the frigate . The first major change to these types was the introduction of steam power for propulsion . While paddle steamer warships had been used from the 1830s onwards , steam propulsion only became suitable for major warships after the adoption of the screw propeller in the 1840s . 
- Steam @-@ powered screw frigates were built in the mid @-@ 1840s , and at the end of the decade the French Navy introduced steam power to its line of battle . The desire for change came from the ambition of Napoleon III to gain greater influence in Europe , which required a challenge to the British at sea . The first purpose @-@ built steam battleship was the 90 @-@ gun Napoléon in 1850 . Napoléon was armed as a conventional ship @-@ of @-@ the @-@ line , but her steam engines could give her a speed of 12 knots ( 22 km / h ) , regardless of the wind conditions : a potentially decisive advantage in a naval engagement . 
- The introduction of the steam ship @-@ of @-@ the @-@ line led to a building competition between France and Britain . Eight sister ships to Napoléon were built in France over a period of ten years , but the United Kingdom soon managed to take the lead in production . Altogether , France built ten new wooden steam battleships and converted 28 from older ships of the line , while the United Kingdom built 18 and converted 41 . 
- 
- = = = Explosive shells = = = 
- 
- The era of the wooden steam ship @-@ of @-@ the @-@ line was brief , because of new , more powerful naval guns . In the 1820s and 1830s , warships began to mount increasingly heavy guns , replacing 18- and 24 @-@ pounder guns with 32 @-@ pounders on sailing ships @-@ of @-@ the @-@ line and introducing 68 @-@ pounders on steamers . Then , the first shell guns firing explosive shells were introduced following their development by the French Général Henri @-@ Joseph Paixhans , and by the 1840s were part of the standard armament for naval powers including the French Navy , Royal Navy , Imperial Russian Navy and United States Navy . It is often held that the power of explosive shells to smash wooden hulls , as demonstrated by the Russian destruction of an Ottoman squadron at the Battle of Sinop , spelled the end of the wooden @-@ hulled warship . The more practical threat to wooden ships was from conventional cannon firing red @-@ hot shot , which could lodge in the hull of a wooden ship and cause a fire or ammunition explosion . Some navies even experimented with hollow shot filled with molten metal for extra incendiary power . 
- 
- = = = Iron armor = = = 
- 
- The use of iron instead of wood as the primary material of ships ' hulls began in the 1830s ; the first " warship " with an iron hull was the gunboat Nemesis , built by Laird for the East India Company in 1839 . There followed , also from Laird , the first full @-@ blown warships with metal hulls , the 1842 steam frigates Guadelupe and Montezuma for the Mexican navy . But a thin iron skin , while not being susceptible to fire or lethal splintering like wood , was not the same thing as providing iron armor calculated to stop enemy gunfire . 
- Following the demonstration of the power of explosive shells against wooden ships at the Battle of Sinop , and fearing that his own ships would be vulnerable to the Paixhans guns of Russian fortifications in the Crimean War , Emperor Napoleon III ordered the development of light @-@ draft floating batteries , equipped with heavy guns and protected by heavy armor . Experiments made during the first half of 1854 proved highly satisfactory , and on 17 July 1854 , the French communicated to the British Government that a solution had been found to make gun @-@ proof vessels and that plans would be communicated . After tests in September 1854 , the British Admiralty agreed to build five armoured floating batteries on the French plans , establishing the important Thames and Millwall Iron Works within the docks . 
- The French floating batteries were deployed in 1855 as a supplement to the wooden steam battle fleet in the Crimean War . The role of the battery was to assist unarmored mortar and gunboats bombarding shore fortifications . The French used three of their ironclad batteries ( Lave , Tonnante and Dévastation ) in 1855 against the defenses at the Battle of Kinburn on the Black Sea , where they were effective against Russian shore defences . They would later be used again during the Italian war in the Adriatic in 1859 . The British floating batteries Glatton and Meteor arrived too late to participate in the action at Kinburn . The British planned to use theirs in the Baltic Sea against the well @-@ fortified naval base at Kronstadt . 
- The batteries have a claim to the title of the first ironclad warships but they were capable of only 4 knots ( 7 km / h ) under their own power : they operated under their own power at the Battle of Kinburn , but had to be towed for long range transit . They were also arguably marginal to the work of the navy . The brief success of the floating ironclad batteries convinced France to begin work on armored warships for their battlefleet . 
- 
- = = Early ironclad ships and battles = = 
- 
- By the end of the 1850s it was clear that France was unable to match British building of steam warships , and to regain the strategic initiative a dramatic change was required . The result was the first ocean @-@ going ironclad , the Gloire , begun in 1857 and launched in 1859 . 
- Gloire 's wooden hull was modelled on that of a steam ship of the line , reduced to one deck , sheathed in iron plates 4 @.@ 5 inches ( 110 mm ) thick . She was propelled by a steam engine , driving a single screw propeller for a speed of 13 knots ( 24 km / h ) . She was armed with thirty @-@ six 6 @.@ 4 @-@ inch ( 160 mm ) rifled guns . France proceeded to construct 16 ironclad warships , including two more sister ships to Gloire , and the only two @-@ decked broadside ironclads ever built , Magenta and Solférino . 
- The Royal Navy had not been keen to sacrifice its advantage in steam ships of the line , but was determined that the first British ironclad would outmatch the French ships in every respect , particularly speed . A fast ship would have the advantage of being able to choose a range of engagement which could make her invulnerable to enemy fire . The British specification was more a large , powerful frigate than a ship @-@ of @-@ the @-@ line . The requirement for speed meant a very long vessel , which had to be built from iron . The result was the construction of two Warrior @-@ class ironclads ; HMS Warrior and HMS Black Prince . The ships had a successful design , though there were necessarily compromises between ' sea @-@ keeping ' , strategic range and armour protection ; their weapons were more effective than those of Gloire , and with the largest set of steam engines yet fitted to a ship they could steam at 14 @.@ 3 knots ( 26 @.@ 5 km / h ) . Yet the Gloire and her sisters had full iron @-@ armour protection along the waterline and the battery itself . Warrior and Black Prince ( but also the smaller Defence and Resistance ) were obliged to concentrate their armour in a central ' citadel ' or ' armoured box ' , leaving many main deck guns and the fore and aft sections of the vessel unprotected . The use of iron in the construction of Warrior also came with some drawbacks ; iron hulls required more regular and intensive repairs than wooden hulls , and iron was more susceptible to fouling by marine life . 
- By 1862 , navies across Europe had adopted ironclads . Britain and France each had sixteen either completed or under construction , though the British vessels were larger . Austria , Italy , Russia , and Spain were also building ironclads . However , the first battles using the new ironclad ships involved neither Britain nor France , and involved ships markedly different from the broadside @-@ firing , masted designs of Gloire and Warrior . The use of ironclads by both sides in the American Civil War , and the clash of the Italian and Austrian fleets at the Battle of Lissa , had an important influence on the development of ironclad design . 
- 
- = = = First battles between ironclads : the U.S. Civil War = = = 
- 
- The first use of ironclads in action came in the U.S. Civil War . The U.S. Navy at the time the war broke out had no ironclads , its most powerful ships being six steam @-@ powered unarmoured frigates . Since the bulk of the Navy remained loyal to the Union , the Confederacy sought to gain advantage in the naval conflict by acquiring modern armored ships . In May 1861 , the Confederate Congress voted that $ 2 million be appropriated for the purchase of ironclads from overseas , and in July and August 1861 the Confederacy started work on constructing and converting wooden ships . 
- On 12 October 1861 , the CSS Manassas became the first ironclad to enter combat , when she fought Union warships on the Mississippi during the Battle of the Head of Passes . She had been converted from a commercial vessel in New Orleans for river and coastal fighting . In February 1862 , the larger CSS Virginia joined the Confederate Navy , having been rebuilt at Norfolk . Constructed on the hull of USS Merrimack , Virginia originally was a conventional warship made of wood , but she was converted into an iron @-@ covered casemate ironclad gunship , when she entered the Confederate navy . By this time , the Union had completed seven ironclad gunboats of the City class , and was about to complete the USS Monitor , an innovative design proposed by the Swedish inventor John Ericsson . The Union was also building a large armored frigate , the USS New Ironsides , and the smaller USS Galena . 
- The first battle between ironclads happened on 9 March 1862 , as the armored Monitor was deployed to protect the Union 's wooden fleet from the ironclad ram Virginia and other Confederate warships . In this engagement , the second day of the Battle of Hampton Roads , the two ironclads repeatedly tried to ram one another while shells bounced off their armor . The battle attracted attention worldwide , making it clear that the wooden warship was now out of date , with the ironclads destroying them easily . 
- The Civil War saw more ironclads built by both sides , and they played an increasing role in the naval war alongside the unarmored warships , commerce raiders and blockade runners . The Union built a large fleet of fifty monitors modeled on their namesake . The Confederacy built ships designed as smaller versions of the Virginia , many of which saw action , but their attempts to buy ironclads overseas were frustrated as European nations confiscated ships being built for the Confederacy — especially in Russia , the only country to openly support the Union through the war . Only CSS Stonewall was completed , and she arrived in American waters just in time for the end of the war . 
- Through the remainder of the war , ironclads saw action in the Union 's attacks on Confederate ports . Seven Union monitors , including USS Montauk , as well as two other ironclads , the ironclad frigate New Ironsides and a light @-@ draft USS Keokuk , participated in the failed attack on Charleston ; one was sunk . Two small ironclads , CSS Palmetto State and CSS Chicora participated in the defence of the harbor . For the later attack at Mobile Bay , the Union assembled four monitors as well as 11 wooden ships , facing the CSS Tennessee , the Confederacy 's most powerful ironclad and the gunboats CSS Morgan , CSS Gaines , CSS Selma . 
- On the western front , the Union built a formidable force of river ironclads , beginning with several converted riverboats and then contracting engineer James Eads of St. Louis , Missouri to build the City @-@ class ironclads . These excellent ships were built with twin engines and a central paddle wheel , all protected by an armored casemate . They had a shallow draft , allowing them to journey up smaller tributaries , and were very well suited for river operations . Eads also produced monitors for use on the rivers , the first two of which differed from the ocean @-@ going monitors in that they contained a paddle wheel ( the USS Neosho and USS Osage ) . 
- Arguably Eads ' vessels were some of the better ironclads of the Western Flotilla , but there were a number of other vessels that served valiantly with the fleet . All were of varying design , some more successful than others , and some were similar to standard riverboats but with armored side @-@ mounted paddle wheels . All were armed with various smoothbore and some rifled guns . If nothing else the experience of the American Civil War and its wild variety of competing ironclad designs , some more successful ( or disastrous ) than others , confirmed the emerging trade @-@ off or compromises required in applying the latest technological advances in iron armour manufacture , ship construction and gun design — to name a few — also going on in Europe . There was no such thing as a ' perfect ' ironclad which could be invincible in every possible encounter ; ship duels , standing up to forts , Brown & Blue @-@ water operations . 
- The Union ironclads played an important role in the Mississippi and tributaries by providing tremendous fire upon Confederate forts , installations and vessels with relative impunity to enemy fire . They were not as heavily armored as the ocean @-@ going monitors of the Union , but they were adequate for their intended use . More Western Flotilla Union ironclads were sunk by torpedoes ( mines ) than by enemy fire , and the most damaging fire for the Union ironclads was from shore installations , not Confederate vessels . 
- 
- = = = Lissa : First fleet battle = = = 
- 
- The first fleet battle , and the first ocean battle , involving ironclad warships was the Battle of Lissa in 1866 . Waged between the Austrian and Italian navies , the battle pitted combined fleets of wooden frigates and corvettes and ironclad warships on both sides in the largest naval battle between the battles of Navarino and Tsushima . 
- The Italian fleet consisted of 12 ironclads and a similar number of wooden warships , escorting transports which carried troops intending to land on the Adriatic island of Lissa . Among the Italian ironclads were seven broadside ironclad frigates , four smaller ironclads , and the newly built Affondatore — a double @-@ turretted ram . Opposing them , the Austrian navy had seven ironclad frigates . 
- The Austrians believed their ships to have less effective guns than their enemy , so decided to engage the Italians at close range and ram them . The Austrian fleet formed into an arrowhead formation with the ironclads in the first line , charging at the Italian ironclad squadron . In the melée which followed both sides were frustrated by the lack of damage inflicted by guns , and by the difficulty of ramming — nonetheless , the effective ramming attack being made by the Austrian flagship against the Italian attracted great attention in following years . 
- The superior Italian fleet lost its two ironclads , Re d 'Italia and Palestro , while the Austrian unarmoured screw two @-@ decker SMS Kaiser remarkably survived close actions with four Italian ironclads . The battle ensured the popularity of the ram as a weapon in European ironclads for many years , and the victory won by Austria established it as the predominant naval power in the Adriatic . 
- The battles of the American Civil War and at Lissa were very influential on the designs and tactics of the ironclad fleets that followed . In particular , they taught a generation of naval officers the misleading lesson that ramming was the best way to sink enemy ironclads . 
- 
- = = Armament and tactics = = 
- 
- The adoption of iron armor meant that the traditional naval armament of dozens of light cannon became useless , since their shot would bounce off an armored hull . To penetrate armor , increasingly heavy guns were mounted on ships ; nevertheless , the view that ramming was the only way to sink an ironclad became widespread . The increasing size and weight of guns also meant a movement away from the ships mounting many guns broadside , in the manner of a ship @-@ of @-@ the @-@ line , towards a handful of guns in turrets for all @-@ round fire . 
- 
- = = = Ram craze = = = 
- 
- From the 1860s to the 1880s many naval designers believed that the development of the ironclad meant that the ram was again the most important weapon in naval warfare . With steam power freeing ships from the wind , and armor making them invulnerable to shellfire , the ram seemed to offer the opportunity to strike a decisive blow . 
- The scant damage inflicted by the guns of Monitor and Virginia at the Battle of Hampton Roads and the spectacular but lucky success of the Austrian flagship SMS Erzherzog Ferdinand Max sinking the Italian Re d 'Italia at Lissa gave strength to the ramming craze . From the early 1870s to early 1880s most British naval officers thought that guns were about to be replaced as the main naval armament by the ram . Those who noted the tiny number of ships that had actually been sunk by ramming struggled to be heard . 
- The revival of ramming had a significant effect on naval tactics . Since the 17th century the predominant tactic of naval warfare had been the line of battle , where a fleet formed a long line to give it the best fire from its broadside guns . This tactic was totally unsuited to ramming , and the ram threw fleet tactics into disarray . The question of how an ironclad fleet should deploy in battle to make best use of the ram was never tested in battle , and if it had been , combat might have shown that rams could only be used against ships which were already stopped dead in the water . 
- The ram finally fell out of favour in the 1880s , as the same effect could be achieved with a torpedo , with less vulnerability to quick @-@ firing guns . 
- 
- = = = Development of naval guns = = = 
- 
- The armament of ironclads tended to become concentrated in a small number of powerful guns capable of penetrating the armor of enemy ships at range ; calibre and weight of guns increased markedly to achieve greater penetration . Throughout the ironclad era navies also grappled with the complexities of rifled versus smoothbore guns and breech @-@ loading versus muzzle @-@ loading . 
- HMS Warrior carried a mixture of 110 @-@ pounder 7 inch ( 180 mm ) breech @-@ loading rifles and more traditional 68 @-@ pounder smoothbore guns . Warrior highlighted the challenges of picking the right armament ; the breech @-@ loaders she carried , designed by Sir William Armstrong , were intended to be the next generation of heavy armament for the Royal Navy , but were shortly withdrawn from service . 
- Breech @-@ loading guns seemed to offer important advantages . A breech @-@ loader could be reloaded without moving the gun , a lengthy process particularly if the gun then needed to be re @-@ aimed . The Warrior 's Armstrong guns also had the virtue of being lighter than an equivalent smoothbore and , because of their rifling , more accurate . Nonetheless , the design was rejected because of problems which plagued breech @-@ loaders for decades . 
- The weakness of the breech @-@ loader was the obvious problem of sealing the breech . All guns are powered by the explosive conversion of gunpowder into gas . This explosion propels the shot or shell out of the front of the gun , but also imposes great stresses on the gun @-@ barrel . If the breech — which experiences some of the greatest forces in the gun — is not entirely secure , then there is a risk that either gas will discharge through the breech or that the breech will break . This in turn reduces the muzzle velocity of the weapon and can also endanger the gun crew . The Warrior 's Armstrong guns suffered from both problems ; the shells were unable to penetrate the 4 @.@ 5 in ( 118 mm ) armor of Gloire , while sometimes the screw which closed the breech flew backwards out of the gun on firing . Similar problems were experienced with the breech @-@ loading guns which became standard in the French and German navies . 
- These problems influenced the British to equip ships with muzzle @-@ loading weapons of increasing power until the 1880s . After a brief introduction of 100 @-@ pounder or 9 @.@ 5 @-@ inch ( 240 mm ) smoothbore Somerset Gun , which weighed 6 @.@ 5 tons ( 6 @.@ 6 t ) , the Admiralty introduced 7 @-@ inch ( 178 mm ) rifled guns , weighing 7 tons . These were followed by a series of increasingly mammoth weapons — guns weighing 12 , 25 , 25 , 38 and finally 81 tons , with calibre increasing from 8 @-@ inch ( 203 mm ) to 16 @-@ inch ( 406 mm ) . 
- The decision to retain muzzle @-@ loaders until the 1880s has been criticised by historians . However , at least until the late 1870s , the British muzzle @-@ loaders had superior performance in terms of both range and rate of fire than the French and Prussian breech @-@ loaders , which suffered from the same problems as had the first Armstrong guns . 
- From 1875 onwards , the balance between breech- and muzzle @-@ loading changed . Captain de Bange invented a method of reliably sealing a breech , adopted by the French in 1873 . Just as compellingly , the growing size of naval guns made muzzle @-@ loading much more complicated . With guns of such size there was no prospect of hauling in the gun for re @-@ loading , or even re @-@ loading by hand , and complicated hydraulic systems were required for re @-@ loading the gun outside the turret without exposing the crew to enemy fire . In 1882 , the 81 @-@ ton , 16 @-@ inch ( 406 mm ) guns of HMS Inflexible fired only once every 11 minutes while bombarding Alexandria during the Urabi Revolt . The 100 @-@ ton , 450 mm ( 17 @.@ 72 inch ) guns of Caio Duilio could each fire a round every 15 minutes . 
- In the Royal Navy , the switch to breech @-@ loaders was finally made in 1879 ; as well as the significant advantages in terms of performance , opinion was swayed by an explosion on board HMS Thunderer caused by a gun being double @-@ loaded , a problem which could only happen with a muzzle @-@ loading gun . 
- The calibre and weight of guns could only increase so far . The larger the gun , the slower it would be to load , the greater the stresses on the ship 's hull , and the less the stability of the ship . The size of the gun peaked in the 1880s , with some of the heaviest calibres of gun ever used at sea . HMS Benbow carried two 16 @.@ 25 @-@ inch ( 413 mm ) breech @-@ loading guns , each weighing 110 tons — no British battleship would ever carry guns as large . The Italian 450 mm ( 17 @.@ 72 inch ) guns would be larger than any gun fitted to a battleship until the 18 @.@ 1 @-@ inch ( 460 mm ) armament of the Japanese Yamato class of World War II . One consideration which became more acute was that even from the original Armstrong models , following the Crimean War , range and hitting power far exceeded simple accuracy , especially at sea where the slightest roll or pitch of the vessel as ' floating weapons @-@ platform ' could negate the advantage of rifling . American ordnance experts accordingly preferred smoothbore monsters whose round shot could at least ' skip ' along the surface of the water . Actual effective combat ranges , they had learned during the Civil War , were comparable to those in the Age of Sail — though a vessel could now be smashed to pieces in only a few rounds . Smoke and the general chaos of battle only added to the problem . As a result , many naval engagements in the ' Age of the Ironclad ' were still fought at ranges within easy eyesight of their targets , and well below the maximum reach of their ships ' guns . 
- Another method of increasing firepower was to vary the projectile fired or the nature of the propellant . Early ironclads used black powder , which expanded rapidly after combustion ; this meant cannons had relatively short barrels , to prevent the barrel itself slowing the shell . The sharpness of the black powder explosion also meant that guns were subjected to extreme stress . One important step was to press the powder into pellets , allowing a slower , more controlled explosion and a longer barrel . A further step forward was the introduction of chemically different brown powder which combusted more slowly again . It also put less stress on the insides of the barrel , allowing guns to last longer and to be manufactured to tighter tolerances . 
- The development of smokeless powder , based on nitroglycerine or nitrocellulose , by the French inventor Paul Vieille in 1884 was a further step allowing smaller charges of propellant with longer barrels . The guns of the pre @-@ Dreadnought battleships of the 1890s tended to be smaller in calibre than those of the ships of the 1880s , most often 12 in ( 305 mm ) , but progressively grew in length of barrel , making use of improved propellants to gain greater muzzle velocity . 
- The nature of the projectiles also changed during the ironclad period . Initially , the best armor @-@ piercing projectile was a solid cast @-@ iron shot . Later , shot of chilled iron , a harder iron alloy , gave better armor @-@ piercing qualities . Eventually the armor @-@ piercing shell was developed . 
- 
- = = = Positioning of armament = = = 
- 
- 
- = = = = Broadside ironclads = = = = 
- 
- The first British , French and Russian ironclads , in a logical development of warship design from the long preceding era of wooden ships of the line , carried their weapons in a single line along their sides and so were called " broadside ironclads . " Both Gloire and HMS Warrior were examples of this type . Because their armor was so heavy , they could only carry a single row of guns along the main deck on each side rather than a row on each deck . 
- A significant number of broadside ironclads were built in the 1860s , principally in Britain and France , but in smaller numbers by other powers including Italy , Austria , Russia and the United States . The advantages of mounting guns on both broadsides were that the ship could engage more than one adversary at a time , and the rigging did not impede the field of fire . 
- Broadside armament also had disadvantages , which became more serious as ironclad technology developed . Heavier guns to penetrate ever @-@ thicker armor meant that fewer guns could be carried . Furthermore , the adoption of ramming as an important tactic meant the need for ahead and all @-@ round fire . These problems led to broadside designs being superseded by designs that gave greater all @-@ round fire , which included central @-@ battery , turret , and barbette designs . 
- 
- = = = = Turrets , batteries and barbettes = = = = 
- 
- There were two main design alternatives to the broadside . In one design , the guns were placed in an armoured casemate amidships : this arrangement was called the ' box @-@ battery ' or ' centre @-@ battery ' . In the other , the guns could be placed on a rotating platform to give them a broad field of fire ; when fully armored , this arrangement was called a turret and when partially armored or unarmored , a barbette . 
- The centre @-@ battery was the simpler and , during the 1860s and 1870s , the more popular method . Concentrating guns amidships meant the ship could be shorter and handier than a broadside type . The first full @-@ scale centre @-@ battery ship was HMS Bellerophon of 1865 ; the French laid down centre @-@ battery ironclads in 1865 which were not completed until 1870 . Centre @-@ battery ships often , but not always , had a recessed freeboard enabling some of their guns to fire directly ahead . 
- The turret was first used in naval combat on the USS Monitor in 1862 , with a type of turret designed by the Swedish engineer John Ericsson . A competing turret design was proposed by the British inventor Cowper Coles with a prototype of this installed on HMS Trusty in 1861 for testing and evaluation purposes . Ericsson 's turret turned on a central spindle , and Coles 's turned on a ring of bearings . Turrets offered the maximum arc of fire from the guns , but there were significant problems with their use in the 1860s . The fire arc of a turret would be considerably limited by masts and rigging , so they were unsuited to use on the earlier ocean @-@ going ironclads . The second problem was that turrets were extremely heavy . Ericsson was able to offer the heaviest possible turret ( guns and armour protection ) by deliberately designing a ship with very low freeboard . The weight thus saved from having a high broadside above the waterline was diverted to actual guns and armour . Low freeboard , however , also meant a smaller hull and therefore a smaller capacity for coal storage — and therefore range of the vessel . In many respects , the turreted , low @-@ freeboard Monitor and the broadside sailer HMS Warrior represented two opposite extremes in what an ' Ironclad ' was all about . The most dramatic attempt to compromise these two extremes , or ' squaring this circle ' , was designed by Captain Cowper Phipps Coles : HMS Captain , a dangerously low freeboard turret ship which nevertheless carried a full rig of sail , and which subsequently capsized not long after her launch in 1870 . Her half @-@ sister HMS Monarch was restricted to firing from her turrets only on the port and starboard beams . The third Royal Navy ship to combine turrets and masts was HMS Inflexible of 1876 , which carried two turrets on either side of the centre @-@ line , allowing both to fire fore , aft and broadside . 
- A lighter alternative to the turret , particularly popular with the French navy , was the barbette . These were fixed armored towers which held a gun on a turntable . The crew was sheltered from direct fire , but vulnerable to plunging fire , for instance from shore emplacements . The barbette was lighter than the turret , needing less machinery and no roof armor — though nevertheless some barbettes were stripped of their armor plate to reduce the top @-@ weight of their ships . The barbette became widely adopted in the 1880s , and with the addition of an armored ' gun @-@ house ' , transformed into the turrets of the pre @-@ Dreadnought battleships . 
- 
- = = = Torpedoes = = = 
- 
- The ironclad age saw the development of explosive torpedoes as naval weapons , which helped complicate the design and tactics of ironclad fleets . The first torpedoes were static mines , used extensively in the American Civil War . That conflict also saw the development of the spar torpedo , an explosive charge pushed against the hull of a warship by a small boat . For the first time , a large warship faced a serious threat from a smaller one — and given the relative inefficiency of shellfire against ironclads , the threat from the spar torpedo was taken seriously . The U.S. Navy converted four of its monitors to become turretless armored spar @-@ torpedo vessels while under construction in 1864 – 5 , but these vessels never saw action . Another proposal , the towed or ' Harvey ' torpedo , involved an explosive on a line or outrigger ; either to deter a ship from ramming or to make a torpedo attack by a boat less suicidal . 
- A more practical and influential weapon was the self @-@ propelled or Whitehead torpedo . Invented in 1868 and deployed in the 1870s , the Whitehead torpedo formed part of the armament of ironclads of the 1880s like HMS Inflexible and the Italian Caio Duilio and Enrico Dandolo . The ironclad 's vulnerability to the torpedo was a key part of the critique of armored warships made by the Jeune Ecole school of naval thought ; it appeared that any ship armored enough to prevent destruction by gunfire would be slow enough to be easily caught by torpedo . In practice , however , the Jeune Ecole was only briefly influential and the torpedo formed part of the confusing mixture of weapons possessed by ironclads . 
- 
- = = Armor and construction = = 
- 
- The first ironclads were built on wooden or iron hulls , and protected by wrought iron armor backed by thick wooden planking . Ironclads were still being built with wooden hulls into the 1870s . 
- 
- = = = Hulls : iron , wood and steel = = = 
- 
- Using iron construction for warships offered advantages for the engineering of the hull . However , unarmored iron had many military disadvantages , and offered technical problems which kept wooden hulls in use for many years , particularly for long @-@ range cruising warships . 
- Iron ships had first been proposed for military use in the 1820s . In the 1830s and 1840s , France , Britain and the United States had all experimented with iron @-@ hulled but unarmored gunboats and frigates . However , the iron @-@ hulled frigate was abandoned by the end of the 1840s , because iron hulls were more vulnerable to solid shot ; iron was more brittle than wood , and iron frames more likely to fall out of shape than wood . 
- The unsuitability of unarmored iron for warship hulls meant that iron was only adopted as a building material for battleships when protected by armor . However , iron gave the naval architect many advantages . Iron allowed larger ships and more flexible design , for instance the use of watertight bulkheads on the lower decks . Warrior , built of iron , was longer and faster than the wooden @-@ hulled Gloire . Iron could be produced to order and used immediately , in contrast to the need to give wood a long period of seasoning . And , given the large quantities of wood required to build a steam warship and the falling cost of iron , iron hulls were increasingly cost @-@ effective . The main reason for the French use of wooden hulls for the ironclad fleet built in the 1860s was that the French iron industry could not supply enough , and the main reason why Britain built its handful of wooden @-@ hulled ironclads was to make best use of hulls already started and wood already bought . 
- Wooden hulls continued to be used for long @-@ range and smaller ironclads , because iron nevertheless had a significant disadvantage . Iron hulls suffered quick fouling by marine life , slowing the ships down — manageable for a European battlefleet close to dry docks , but a difficulty for long @-@ range ships . The only solution was to sheath the iron hull first in wood and then in copper , a laborious and expensive process which made wooden construction remain attractive . Iron and wood were to some extent interchangeable : the Japanese Kongō and Hiei ordered in 1875 were sister @-@ ships , but one was built of iron and the other of composite construction . 
- After 1872 , steel started to be introduced as a material for construction . Compared to iron , steel allows for greater structural strength for a lower weight . The French Navy led the way with the use of steel in its fleet , starting with the Redoutable , laid down in 1873 and launched in 1876 . Redoutable nonetheless had wrought iron armor plate , and part of her exterior hull was iron rather than steel . 
- Even though Britain led the world in steel production , the Royal Navy was slow to adopt steel warships . The Bessemer process for steel manufacture produced too many imperfections for large @-@ scale use on ships . French manufacturers used the Siemens @-@ Martin process to produce adequate steel , but British technology lagged behind . The first all @-@ steel warships built by the Royal Navy were the dispatch vessels Iris and Mercury , laid down in 1875 and 1876 . 
- 
- = = = Armor and protection schemes = = = 
- 
- Iron @-@ built ships used wood as part of their protection scheme . HMS Warrior was protected by 4 @.@ 5 in ( 114 mm ) of wrought iron backed by 15 in ( 381 mm ) of teak , the strongest shipbuilding wood . The wood played two roles , preventing spalling and also preventing the shock of a hit damaging the structure of the ship . Later , wood and iron were combined in ' sandwich ' armor , for instance in HMS Inflexible . 
- Steel was also an obvious material for armor . It was tested in the 1860s , but the steel of the time was too brittle and disintegrated when struck by shells . Steel became practical to use when a way was found to fuse steel onto wrought iron plates , giving a form of compound armor . This compound armor was used by the British in ships built from the late 1870s , first for turret armor ( starting with HMS Inflexible ) and then for all armor ( starting with HMS Colossus of 1882 ) . The French and German navies adopted the innovation almost immediately , with licenses being given for the use of the ' Wilson System ' of producing fused armor . 
- The first ironclads to have all @-@ steel armor were the Italian Caio Duilio and Enrico Dandolo . Though the ships were laid down in 1873 their armor was not purchased from France until 1877 . The French navy decided in 1880 to adopt compound armor for its fleet , but found it limited in supply , so from 1884 the French navy was using steel armor . Britain stuck to compound armor until 1889 . 
- The ultimate ironclad armor was case hardened nickel @-@ steel . In 1890 , the U.S. Navy tested steel armor hardened by the Harvey process and found it superior to compound armor . For several years ' Harvey steel ' was the state of the art , produced in the U.S. , France , Germany , Britain , Austria and Italy . In 1894 , the German firm Krupp developed gas cementing , which further hardened steel armor . The German Kaiser Friedrich III , laid down in 1895 , was the first ship to benefit from the new ' Krupp armor ' and the new armor was quickly adopted ; the Royal Navy using it from HMS Canopus , laid down in 1896 . By 1901 almost all new battleships used Krupp armor , though the U.S. continued to use Harvey armor alongside until the end of the decade . 
- The equivalent strengths of the different armor plates were as follows : 15 in ( 381 mm ) of wrought iron was equivalent to 12 in ( 305 mm ) of either plain steel or compound iron and steel armor , and to 7 @.@ 75 in ( 197 mm ) of Harvey armor or 5 @.@ 75 in ( 146 mm ) of Krupp armor . 
- Ironclad construction also prefigured the later debate in battleship design between tapering and ' all @-@ or @-@ nothing ' armour design . Warrior was only semi @-@ armoured , and could have been disabled by hits on the bow and stern . As the thickness of armor grew to protect ships from the increasingly heavy guns , the area of the ship which could be fully protected diminished . Inflexible 's armor protection was largely limited to the central citadel amidships , protecting boilers and engines , turrets and magazines , and little else . An ingenious arrangement of cork @-@ filled compartments and watertight bulkheads was intended to keep her stable and afloat in the event of heavy damage to her un @-@ armored sections . 
- 
- = = Propulsion : steam and sail = = 
- 
- The first ocean @-@ going ironclads carried masts and sails like their wooden predecessors , and these features were only gradually abandoned . Early steam engines were inefficient ; the wooden steam fleet of the Royal Navy could only carry " 5 to 9 days coal " , and the situation was similar with the early ironclads . Warrior also illustrates two design features which aided hybrid propulsion ; she had retractable screws to reduce drag while under sail ( though in practice the steam engine was run at a low throttle ) , and a telescopic funnel which could be folded down to the deck level . 
- Ships designed for coastal warfare , like the floating batteries of the Crimea , or USS Monitor and her sisters , dispensed with masts from the beginning . The British HMS Devastation , started in 1869 , was the first large , ocean @-@ going ironclad to dispense with masts . Her principal role was for combat in the English Channel and other European waters ; and while her coal supplies gave her enough range to cross the Atlantic , she would have had little endurance on the other side of the ocean . The Devastation and the similar ships commissioned by the British and Russian navies in the 1870s were the exception rather than the rule . Most ironclads of the 1870s retained masts , and only the Italian navy , which during that decade was focused on short @-@ range operations in the Adriatic , built consistently mastless ironclads . 
- During the 1860s , steam engines improved with the adoption of double @-@ expansion steam engines , which used 30 – 40 % less coal than earlier models . The Royal Navy decided to switch to the double @-@ expansion engine in 1871 , and by 1875 they were widespread . However , this development alone was not enough to herald the end of the mast . Whether this was due to a conservative desire to retain sails , or was a rational response to the operational and strategic situation , is a matter of debate . A steam @-@ only fleet would require a network of coaling stations worldwide , which would need to be fortified at great expense to stop them falling into enemy hands . Just as significantly , because of unsolved problems with the technology of the boilers which provided steam for the engines , the performance of double @-@ expansion engines was rarely as good in practice as it was in theory . 
- During the 1870s the distinction grew between ' first @-@ class ironclads ' or ' battleships ' on the one hand , and ' cruising ironclads ' designed for long @-@ range work on the other . The demands on first @-@ class ironclads for very heavy armor and armament meant increasing displacement , which reduced speed under sail ; and the fashion for turrets and barbettes made a sailing rig increasingly inconvenient . HMS Inflexible , launched in 1876 but not commissioned until 1881 , was the last British battleship to carry masts , and these were widely seen as a mistake . The start of the 1880s saw the end of sailing rig on ironclad battleships . 
- Sails persisted on ' cruising ironclads ' for much longer . During the 1860s , the French navy had produced the Alma and La Galissonnière classes as small , long @-@ range ironclads for use as overseas cruisers and the British had responded with ships like HMS Swiftsure of 1870 . The Russian ship General @-@ Admiral , laid down in 1870 and completed in 1875 , was a model of a fast , long @-@ range ironclad which was likely to be able to outrun and outfight ships like Swiftsure . Even the later HMS Shannon , often described as the first British armored cruiser , would have been too slow to outrun General @-@ Admiral . While Shannon was the last British ship with a retractable propeller , later armored cruisers of the 1870s retained sailing rig , sacrificing speed under steam in consequence . It took until 1881 for the Royal Navy to lay down a long @-@ range armored warship capable of catching enemy commerce raiders , HMS Warspite , which was completed in 1888 . While sailing rigs were obsolescent for all purposes by the end of the 1880s , rigged ships were in service until the early years of the 20th century . 
- The final evolution of ironclad propulsion was the adoption of the triple @-@ expansion steam engine , a further refinement which was first adopted in HMS Sans Pareil , laid down in 1885 and commissioned in 1891 . Many ships also used a forced draught to get additional power from their engines , and this system was widely used until the introduction of the steam turbine in the mid @-@ 1900s ( decade ) . 
- 
- = = Fleets = = 
- 
- While ironclads spread rapidly in navies worldwide , there were few pitched naval battles involving ironclads . Most European nations settled differences on land , and the Royal Navy struggled to maintain a deterrent parity with at least France , while providing suitable protection to Britain 's commerce and colonial outposts worldwide . Ironclads remained , for the British Royal Navy , a matter of defending the British Isles first and projecting power abroad second . Those naval engagements of the latter half of the 19th century which involved ironclads normally involved colonial actions or clashes between second @-@ rate naval powers . But these encounters were often enough to convince British policy @-@ makers of the increasing hazards of strictly naval foreign intervention , from Hampton Roads in the American Civil War to the hardening combined defences of naval arsenals such as Kronstadt and Cherbourg . 
- There were many types of ironclads : 
- Seagoing ships intended to " stand in the line of battle " ; the precursors of the battleship . 
- Coastal service and riverine vessels , including ' floating batteries ' and ' monitors ' . 
- Vessels intended for commerce raiding or protection of commerce , called ' armoured cruisers ' . 
- 
- = = = Navies = = = 
- 
- The United Kingdom possessed the largest navy in the world for the whole of the ironclad period . The Royal Navy was the second to adopt ironclad warships , and it applied them worldwide in their whole range of roles . In the age of sail , the British strategy for war depended on the Royal Navy mounting a blockade of the ports of the enemy . Because of the limited endurance of steamships , this was no longer possible , so the British at times considered the risk @-@ laden plan of engaging an enemy fleet in harbor as soon as war broke out . To this end , the Royal Navy developed a series of ' coast @-@ defence battleships ' , starting with the Devastation class . These ' breastwork monitors ' were markedly different from the other high @-@ seas ironclads of the period and were an important precursor of the modern battleship . As long @-@ range monitors they could reach Bermuda unescorted , for example . However , they were still armed with only four heavy guns and were as vulnerable to mines and obstructions ( and enemy monitors ) as the original monitors of the Union Navy proved to be during the Civil War . The British prepared for an overwhelming mortar bombardment of Kronstadt by the close of the Crimean War , but never considered running the smoke @-@ ridden , shallow @-@ water gauntlet straight to St. Petersburg with ironclads . Likewise , monitors proved acutely unable to ' overwhelm ' enemy fortifications single @-@ handed during the American conflict , though their low @-@ profile and heavy armour protection made them ideal for running gauntlets . Mines and obstructions , however , negated these advantages — a problem the British Admiralty frequently acknowledged but never countered throughout the period . The British never laid down enough Devastation @-@ class ' battleships ' to instantly overwhelm Cherbourg , Kronstadt or even New York City with gunfire . 
Although throughout the 1860s and 1870s the Royal Navy was still in many respects superior to its potential rivals , by the early 1880s widespread concern about the threat from France and Germany culminated in the Naval Defence Act , which promulgated the idea of a ' two @-@ power standard ' , that Britain should possess as many ships as the next two navies combined . This standard provoked aggressive shipbuilding in the 1880s and 1890s . 
- British ships did not participate in any major wars in the ironclad period . The Royal Navy 's ironclads only saw action as part of colonial battles or one @-@ sided engagements like the bombardment of Alexandria in 1882 . Defending British interests against Ahmed ' Urabi 's Egyptian revolt , a British fleet opened fire on the fortifications around the port of Alexandria . A mixture of centre @-@ battery and turret ships bombarded Egyptian positions for most of a day , forcing the Egyptians to retreat ; return fire from Egyptian guns was heavy at first , but inflicted little damage , killing only five British sailors . Few Egyptian guns were actually dismounted , on the other hand , and the fortifications themselves were typically left intact . Had the Egyptians actually utilised the heavy mortars that were at their disposal , they might have quickly turned the tide , for the attacking British ironclads found it easy ( for accuracy 's sake ) to simply anchor whilst firing — perfect targets for high @-@ angle fire upon their thinly armoured topdecks . 
- The French navy built the first ironclad to try to gain a strategic advantage over the British , but were consistently out @-@ built by the British . Despite taking the lead with a number of innovations like breech @-@ loading weapons and steel construction , the French navy could never match the size of the Royal Navy . In the 1870s , the construction of ironclads ceased for a while in France as the Jeune Ecole school of naval thought took prominence , suggesting that torpedo boats and unarmored cruisers would be the future of warships . Like the British , the French navy saw little action with its ironclads ; the French blockade of Germany in the Franco @-@ Prussian War was ineffective , as the war was settled entirely on land . 
- Russia built a number of ironclads , generally copies of British or French designs . Nonetheless , there were real innovations from Russia : the first true type of ironclad armored cruiser , the General @-@ Admiral of the 1870s , and a set of remarkably badly designed circular battleships referred to as ' popovkas ' ( for Admiral Popov , who conceived the design ) . The Russian Navy pioneered the wide @-@ scale use of torpedo boats during the Russo @-@ Turkish War of 1877 – 1878 , mainly out of necessity because of the superior numbers and quality of ironclads used by the Turkish navy . Russia expanded her navy in the 1880s and 1890s with modern armored cruisers and battleships , but the ships were manned by inexperienced crews and politically appointed leadership , which contributed to their defeat in the Battle of Tsushima on 27 May 1905 . 
- The U.S. Navy ended the Civil War with about fifty monitor @-@ type coastal ironclads ; by the 1870s most of these were laid up in reserve , leaving the USA virtually without an ironclad fleet . Another five large monitors were ordered in the 1870s . The limitations of the monitor type effectively prevented the USA from projecting power overseas , and until the 1890s the USA would have come off badly in a conflict with even Spain or the Latin American powers . The 1890s saw the beginning of what became the Great White Fleet , and it was the modern pre @-@ Dreadnoughts and armored cruisers built in the 1890s which defeated the Spanish fleet in the Spanish – American War of 1898 . This started a new era of naval warfare . 
- Ironclads were widely used in South America . Both sides used ironclads in the Chincha Islands War between Spain and the combined forces of Peru and Chile in the early 1860s . The powerful Spanish Numancia participated in the Battle of Callao but was unable to inflict significant damage to the Callao defences . Besides , Peru was able to deploy two locally built ironclads based on American Civil War designs , the Loa ( a wooden ship converted into a casemate ironclad ) and the Victoria ( a small monitor armed with a single 68 @-@ pdr gun ) , as well as two British @-@ built ironclads : Independencia , a centre @-@ battery ship , and the turret ship Huáscar . Numancia was the first ironclad to circumnavigate the world , arriving in Cádiz on 20 September 1867 , and earning the motto : " Enloricata navis que primo terram circuivit " [ " First ironclad ship to sail around the world " ] . In the War of the Pacific in 1879 , both Peru and Chile had ironclad warships , including some of those used a few years previously against Spain . While the Independencia ran aground early on , the Peruvian ironclad Huáscar made a great impact against Chilean shipping , delaying the Chilean ground invasion by six months . She was eventually caught by two more modern Chilean centre @-@ battery ironclads , the Blanco Encalada and the Almirante Cochrane at the Battle of Angamos Point . 
- Ironclads were also used from the inception of the Imperial Japanese Navy . The Kōtetsu ( Japanese : 甲鉄 , literally " Ironclad " , later renamed Azuma 東 , " East " ) had a decisive role in the Naval Battle of Hakodate Bay in May 1869 , which marked the end of the Boshin War , and the complete establishment of the Meiji Restoration . The IJN continued to develop its strength and commissioned a number of warships from British and European shipyards , first ironclads and later armored cruisers . These ships engaged the Chinese Beiyang fleet which was superior on paper at least at the Battle of the Yalu River . Thanks to superior short @-@ range firepower , the Japanese fleet came off better , sinking or severely damaging eight ships and receiving serious damage to only four . The naval war was concluded the next year at the Battle of Weihaiwei , where the strongest remaining Chinese ships were surrendered to the Japanese . 
- 
- = = End of the ironclad warship = = 
- 
- There is no clearly defined end to the ironclad , besides the transition from wood hulls to all @-@ metal . Ironclads continued to be used in World War I . Towards the end of the 19th century , the descriptions ' battleship ' and ' armored cruiser ' came to replace the term ' ironclad ' . 
- The proliferation of ironclad battleship designs came to an end in the 1890s as navies reached a consensus on the design of battleships , producing the type known as the pre @-@ Dreadnought . These ships are sometimes covered in treatments of the ironclad warship . The next evolution of battleship design , the dreadnought , is never referred to as an ' ironclad ' . 
- Most of the ironclads of the 1870s and 1880s served into the 1900s ( decade ) . For instance , a handful of US navy monitors laid down in the 1870s saw active service in World War I . Pre @-@ Dreadnought battleships and cruisers of the 1890s saw widespread action in World War I and in some cases through to World War II . 
- 
- = = = Legacy = = = 
- 
- The example of the ironclads had some bearing on the history of the tank , as ironclad warships became an inspiration for ideas of landships and other armored vehicles . H. G. Wells , in his short story The Land Ironclads , published in The Strand Magazine in December 1903 , described the use of large , armoured cross @-@ country vehicles , armed with cannon and machine guns , and equipped with pedrail wheels . 
- 
- = = Today = = 
- 
- A number of ironclads have been preserved or reconstructed as museum ships . 
- Parts of USS Monitor have been recovered and are being conserved and displayed at the Mariners ' Museum in Newport News , Virginia . 
- HMS Warrior is today a fully restored museum ship in Portsmouth , England . 
- Huáscar is berthed at the port of Talcahuano , Chile , on display for visitors . 
- The City @-@ class ironclad USS Cairo is currently on display in Vicksburg , Mississippi . 
- Northrop Grumman in Newport News constructed a full @-@ scale replica of USS Monitor . The replica was laid down in February 2005 and completed just two months later . 
- The Dutch Ramtorenschip ( Coastal ram ) Zr . Ms. Buffel is currently on display in the Maritime Museum Rotterdam . 
- The Dutch Ramtorenschip ( Coastal ram ) Zr . Ms. Schorpioen is a museum ship at Den Helder . 
- The complete , recovered wooden hull of the CSS Neuse , a casemate ram ironclad , is on view in Kinston , North Carolina , and , in another part of town on the Neuse River , the recreated ship , named CSS Neuse II , is nearly built and can be visited . 
- The hull of the casemate ironclad CSS Jackson can be seen in the National Civil War Naval Museum at Port Columbus , Georgia . 
- The new United States Navy Zumwalt @-@ class guided missile destroyer has been described as bearing resemblance to ironclads . 
- 
- 
- = Little Gidding ( poem ) = 
- 
- Little Gidding is the fourth and final poem of T. S. Eliot 's Four Quartets , a series of poems that discuss time , perspective , humanity , and salvation . It was first published in September 1942 after being delayed for over a year because of the air @-@ raids on Great Britain during World War II and Eliot 's declining health . The title refers to a small Anglican community in Huntingdonshire , established by Nicholas Ferrar in the 17th century and scattered during the English Civil War . 
- The poem uses the combined image of fire and Pentecostal fire to emphasise the need for purification and purgation . According to the poet , humanity 's flawed understanding of life and turning away from God leads to a cycle of warfare , but this can be overcome by recognising the lessons of the past . Within the poem , the narrator meets a ghost that is a combination of various poets and literary figures . Little Gidding focuses on the unity of past , present , and future , and claims that understanding this unity is necessary for salvation . 
- 
- = = Background = = 
- 
- Following the completion of the third Four Quartets poem , The Dry Salvages , Eliot 's health declined and he stayed in Shamley Green , Surrey while he recovered . During this time , Eliot started writing Little Gidding . The first draft was completed in July 1941 but he was dissatisfied with it . He believed the problems with the poem lay with his own inability to write , and that , precipitated by air raids on London , he had started the poem with too little preparation and had written it too quickly . After the first draft was written , he set the poem aside , and he left in September to lecture throughout Great Britain . 
- After months of not working on the poem , Eliot began to feel compelled to finish it ; it was not until August 1942 , however , that he started working on it again . In total , there were five drafts . The poem was finished by 19 September 1942 and published in the October New English Weekly . Little Gidding was intended to conclude the Four Quartets series , summarising Eliot 's views expressed in this series of poems . 
- Little Gidding was the home of an Anglican community established in 1626 by Nicholas Ferrar . The Ferrar household lived a Christian life according to High Church principles and the Book of Common Prayer . The religious community was dispersed during the English Civil War between Parliamentarians and Royalists but reformed , ending with the death of John Ferrar in 1657 . Eliot had visited the site in May 1936 . 
- Unlike the other locations mentioned in the titles of the Four Quartets poems , Eliot had no direct connection to the original Christian community . As such , the community is supposed to represent almost any religious community . 
- 
- = = Poem = = 
- 
- Critics classify Little Gidding as a poem of fire with an emphasis on purgation and the Pentecostal fire . The beginning of the poem discusses time and winter , with attention paid to the arrival of summer . The images of snow , which provoke desires for a spiritual life , transition into an analysis of the four classical elements of fire , earth , air and water and how fire is the primary element of the four . Following this is a discussion on death and destruction , things unaccomplished , and regret for past events . 
- While using Dante 's terza rima style , the poem continues by describing the Battle of Britain . The image of warfare merges with the depiction of Pentecost , and the Holy Spirit is juxtaposed with the air @-@ raids on London . In the second section , a ghost , representing the poets of the past stuck between worlds , begins talking to the narrator of the poem . The ghost discusses change , art in general , and how humankind is flawed . The only way to overcome the problematic condition of humanity , according to the ghost , is to experience purgation through fire . The fire is described in a manner similar to Julian of Norwich 's writing about God 's love and discussed in relationship to the shirt of Nessus , a shirt that burns its wearer . Little Gidding continues by describing the eternalness of the present and how history exists in a pattern . The poem concludes by explaining how sacrifice is needed to allow an individual to die into life and be reborn , and that salvation should be the goal of humankind . 
- 
- = = Themes = = 
- 
- In terms of renewal , Eliot believed that suffering was needed for all of society before new life could begin . The original Little Gidding community was built for living on monastic lines , but the community was damaged and dispersed by Puritan forces during the English Civil War in 1646 . The church , the centre of the community , was restored in 1714 and again in 1853 . The image of religious renewal is combined with the image of the London air @-@ raids and the constant fighting and destruction within the world . This compound image is used to discuss the connection of holy places with the Holy Spirit , Pentecost , communion with the dead , and the repetition of history . The theme is also internal to Eliot 's own poems ; the image of the rose garden at the end of Little Gidding is the image that begins Burnt Norton and the journey is made circular . Also , the depiction of time within the poem is similar to the way time operates within The Family Reunion . 
- Like the other poems making up the Four Quartets , Little Gidding deals with the past , present , and future , and humanity 's place within them as each generation is seemingly united . In the second section , there is a ghost who is the compilation of various poets , including Dante , Swift , Yeats , and others . When the ghost joins the poet , the narrator states " Knowing myself yet being someone other " . This suggests that the different times merge at the same time that the different personalities begin to merge , allowing a communication and connection with the dead . Later , in the fourth section , humanity is given a choice between the Holy Spirit or the bombing of London ; redemption or destruction . God 's love allows humankind to be redeemed and escape the living hell through purgation by fire . The end of the poem describes how Eliot has attempted to help the world as a poet . He parallels his work in language with working on the soul or working on society . 
- The ghost , a combination of many literary figures , was originally addressed in the poem as " Ser Brunetto " before being revised as an ambiguous " you " . " Ser Brunetto " was Dante 's way of addressing Brunetto Latini , a former mentor whom he meets in Hell to which he has been condemned for sodomy . Eliot , in a letter to John Hayward dated 27 August 1942 , explained why he changed the wording : 
- I think you will recognise that it was necessary to get rid of Brunetto for two reasons . The first is that the visionary figure has now become somewhat more definite and will no doubt be identified by some readers with Yeats though I do not mean anything so precise as that . However , I do not wish to take the responsibility of putting Yeats or anybody else into Hell and I do not want to impute to him the particular vice which took Brunetto there . Secondly , although the reference to that Canto is intended to be explicit , I wish the effect of the whole to be Purgatorial which is more appropriate . That brings us to the reference to swimming in fire which you will remember at the end of Purgatorio 26 where the poets are found . 
- The theme of swimming through flames is connected to the depiction of Guido Guinizelli , a poet that influenced Dante , seeking such a state in Purgatorio XXVI . However , the depiction of swimming was transformed into an image of dancing , an act that appears throughout Yeats 's poetry , within purgatorial flames . The critic Dominic Manganiello suggests that , in combining the image of dancing with purgation , Eliot merges Dante 's and Yeats 's poetic themes . 
- 
- = = Reception = = 
- 
- Critics such as Malcolm Cowley and Delmore Schwartz describe mixed emotions about the religiosity of the poem . Cowley emphasised the mystical nature of the poem and how its themes were closer to Buddhism than Anglicanism while mentioning his appreciation of many of the passages . Schwartz also mentioned the Buddhist images and his admiration for many of the lines in Little Gidding . F. B. Pinion believed that the fourth section of the poem cost " Eliot more trouble and vexation than any passage of the same length he ever wrote , and is his greatest achievement in the Four Quartets . " E. M. Forster did not like Eliot 's emphasis on pain and responded to the poem : " Of course there 's pain on and off through each individual 's life ... You can 't shirk it and so on . But why should it be endorsed by the schoolmaster and sanctified by the priest until the fire and the rose are one when so much of it is caused by disease and bullies ? It is here that Eliot becomes unsatisfactory as a seer . " Writing in 2003 , Roger Scruton wrote that in " Little Gidding " Eliot achieved " that for which he envies Dante — namely , a poetry of belief , in which belief and words are one , and in which the thought cannot be prized free from the controlled and beautiful language " . 
- 
- 
- = The Portage to San Cristobal of A.H. = 
- 
- The Portage to San Cristobal of A.H. is a 1981 literary and philosophical novella by George Steiner , in which Jewish Nazi hunters find a fictional Adolf Hitler ( A.H. ) alive in the Amazon jungle thirty years after the end of World War II . The book generated considerable controversy after its publication because in it , Steiner , who is Jewish , allows Hitler to defend himself when he is put on trial in the jungle by his captors . There Hitler maintains that Israel owes its existence to the Holocaust and that he is the " benefactor of the Jews " . 
- The Portage to San Cristobal of A.H. was a 1983 finalist in the PEN / Faulkner Award for Fiction . It was adapted for the theatre by British playwright Christopher Hampton and was staged in London in April 1982 with Alec McCowen playing the part of Adolf Hitler . It was also staged in Hartford , Connecticut in the United States in 1983 and starred John Cullum as Hitler . 
- 
- = = Plot summary = = 
- 
- From his base in Tel Aviv , Holocaust survivor Emmanuel Lieber directs a group of Jewish Nazi hunters in search of Adolf Hitler . Lieber believes that the former Führer is still alive , and following rumours and hearsay , he tracks Hitler 's movements through South America , until after months of wading through swamps in the Amazon jungle , the search party finds the 90 @-@ year @-@ old alive in a clearing . Lieber flies to San Cristóbal where he awaits the group 's return with their captive . But getting the old man out of the jungle alive is more difficult than getting in , and their progress is further hampered by heavy thunderstorms . 
- Meanwhile , broken and incoherent radio messages between Lieber and the search party are intercepted by intelligence agents tracking their progress , and rumours begin to spread across the world of Hitler 's capture . Debates flare up over his impending trial , where it will be held and under whose jurisdiction . Orosso is identified as the nearest airfield to the last known location of the search party , and aircraft begin arriving at the hitherto unknown town . But when the search party loses radio contact with Lieber , they must make a decision : do they sit out the storms and deliver their captive to Lieber later , or do they try Hitler in the jungle before their prize is snatched from them by the world at large , who they know will be waiting ? Their decision is the latter , and against Lieber 's advice ( " You must not let him speak ... his tongue is like no other " ) they prepare for a trial with a judge , prosecution and defence attorneys selected from the members of the search party . Teku , a local Indian tracker , is asked to observe the trial as an independent witness . 
- The attention Hitler is receiving , however , renews his strength , and when the trial begins , he brushes aside his " defence attorney " and begins a long speech in four parts in his own defence : 
- Firstly , Hitler claims he took his doctrines from the Jews and copied the notion of the master race from the Chosen people and their need to separate themselves from the " unclean " . " My racism is a parody of yours , a hungry imitation . " 
- Hitler justifies the Final Solution by maintaining that the Jews ' God , purer than any other , enslaves its subjects , continually demanding more than they can give and " blackmailing " them with ideals that cannot be attained . The " virus of utopia " had to be stopped . 
- Hitler states that he was not the originator of evil . " [ Stalin ] had perfected genocide when I was still a nameless scribbler in Munich . " Further , Hitler asserts that the number of lives lost due to his actions is dwarfed by various world atrocities , including those in Russia , China and Africa . 
- Lastly , Hitler maintains that the Reich begat Israel and suggests that he is the Messiah " whose infamous deeds were allowed by God in order to bring His people home . " He closes by asking , " Should you not honour me who have made ... Zion a reality ? " 
- At the end of his speech , Teku is the first to react and jumps up shouting " Proven " , only to be drowned out by the appearance of a helicopter over the clearing . 
- 
- = = Main characters = = 
- 
- Emmanuel Lieber – Jewish Holocaust survivor and director of the search party to find Hitler ; after crawling out of a death pit in Bialka he never took the time to mend and embarked on a life @-@ consuming obsession to bring those responsible for the genocide to justice . 
- Search party ( all Jewish with family ties to the Holocaust , except for John Asher ) 
- Simeon – search party leader and " presiding judge " at Hitler 's trial ; he is Lieber 's confidant and torn between leading the party into " unmapped quicksand and green bogs " and turning his back on the " quiet mania of Lieber 's conviction " . 
- Gideon Benasseraf – falls ill and dies before the trial begins ; during one of his fever @-@ induced ramblings he suggests that Hitler is Jewish ; he had sought out Lieber after being released from a sanatorium and spending three years recuperating in Paris where the care @-@ free living consumed him with guilt . 
- Elie Barach – Orthodox Jew and " prosecution attorney " at the trial ; he is the moral compass of the group , but his convictions are disturbed by Gideon Benasseraf 's fever @-@ induced assertions that Hitler is Jewish and ends up believing that Hitler may be the second Messiah . 
- Isaac Amsel – an 18 @-@ year @-@ old boy and witness at the trial ; he is the son of Isaac Amsel senior , former member of the search party killed earlier in a skirmish in São Paulo ; he joined the party to avenge his father 's death . 
- John Asher – half @-@ Jewish and reluctant " defence attorney " at the trial ; fascinated by the capture of Bormann and the rumours circulating that Hitler may be alive , he had approached Nazi hunter Wiesenthal who directed him to Lieber ; despite being an " outsider " ( no ties to the Holocaust ) Lieber assigned him to the search party because of his military training and his clear @-@ headedness ( " no metaphysical lusts , no cravings for retribution " ) . 
- Teku – local Indian tracker and independent witness at the trial ; previously the search party 's guide who had abandoned them when they insisted on entering uncharted regions of the jungle , he continued tracking them from a distance before revealing himself . 
- Adolf Hitler – now 90 years old , the former leader of the Third Reich had not died in the Führerbunker in Berlin , but escaped to South America and hid in the Amazon jungle . 
- 
- = = Background and publication = = 
- 
- George Steiner , literary critic for The New Yorker and The New York Times , had written about the Holocaust in some of his previous books , including Anno Domini ( 1964 ) , Language and Silence ( 1967 ) and In Bluebeard 's Castle ( 1971 ) . Many of the ideas Steiner expresses in The Portage to San Cristobal of A.H. were reworked from these earlier works . Steiner told New York Times editor D. J. R. Bruckner that this book arose out of his lifelong work on language . " Central to everything I am and believe and have written is my astonishment ... that you can use human speech both to bless , to love , to build , to forgive and also to torture , to hate , to destroy and to annihilate . " 
- Commenting on the controversy the book generated , Steiner admitted to literary journalist and critic Ron Rosenbaum ( author of Explaining Hitler ) that he too was disturbed by it , adding that his fictional Hitler had gotten the better of him , " golem- or Frankenstein @-@ like " . He said that it felt like the book " wrote me " . Steiner also pointed out that the novella is not only about his thoughts on the Holocaust , but also about the horrific events that took place in countries like Cambodia , Vietnam , El Salvador and Burundi : " My feeling is that one has to grapple with the abyss if one can . " 
- Steiner wrote The Portage to San Cristobal of A.H. in 1975 and 1976 in Geneva , Switzerland , and the 120 @-@ page work originally appeared in the Spring 1979 issue of the United States literary magazine , The Kenyon Review . It also appeared in the Spring 1980 issue of Granta , the British literary magazine . Its first publication in book form , with minor revisions by Steiner , was in May 1981 by Faber and Faber in the United Kingdom — and as requested by Steiner , it was a paperback original . The first United States edition was published in hardcover in April 1982 by Simon & Schuster . 
- 
- = = Adaptations = = 
- 
- The Portage to San Cristobal of A.H. was adapted for the theatre in 1982 by British playwright Christopher Hampton . It was staged in April 1982 at London 's Mermaid Theatre under the direction of John Dexter with Alec McCowen playing the part of Adolf Hitler . McCowen won the 1982 Evening Standard Theatre Award for best actor for this performance . In 1983 the production moved to the United States where it played at the Hartford Stage Company in Hartford , Connecticut , directed by Mark Lamos and starring John Cullum as Hitler . 
- This book is the only work of fiction by Steiner to have been adapted for the stage . 
- 
- = = Reception = = 
- 
- Reaction to The Portage to San Cristobal of A.H. was mixed . Anthony Burgess in The Observer called it " astonishing " , Christopher Booker of The Daily Telegraph described it as a " powerful piece " , and English author A. S. Byatt said it was a " masterpiece " . In Explaining Hitler , Ron Rosenbaum called The Portage " A Frankenstein story " , referring to Steiner 's fictive Hitler as having taken on a life of its own . Writing in Time magazine , Otto Friedrich described the book as " a philosophic fantasy of remarkable intensity " , adding that by not refuting Hitler 's speech , Steiner deviates from the horrors of traditional Holocaust literature and ends the book " on a note of bleak ambiguity " . 
- Morris Dickstein of The New York Times was more critical of the book , calling it " a misconceived and badly executed novel , a sideshow distraction from the serious business of thinking through the unspeakable horrors of the Nazi era . " He described it as " wearisome " that is " suffocate [ d ] " by too much " fine writing " ( belles @-@ lettres ) . He also complained that the characters are lifeless , and while they each have detailed histories , they are only " verbal figments " that do not separate them from one another . Finally Dickstein noted that because almost all the points of Hitler 's speech are drawn from some of Steiner 's earlier works , he " unwittingly creates sympathy for Hitler by making him old and pathetic yet also lucid and brilliant — at once absurdly harmless and unconvincingly dangerous . " 
- In another review in The New York Times John Leonard wrote that while the book has its strong points , " some wit , a catholic disdain , multiplicity of character and a South American swamp @-@ life that terrifies " , its weaknesses are that " the characters are really ideas , ... the symbols clash and there are too many echoes of better books by Kafka and Proust " . But Leonard 's biggest criticism of the book was Hitler 's speech , which he called " obscene " , and Steiner 's decision to end the book at that point , which Leonard said " not only denies the power of art to arrange and transcend , but ... makes me sick to my stomach . " 
- Writing in the American literary magazine Salmagundi , Alvin H. Rosenfeld called The Portage a " breakthrough work " that " astonishes " . He was struck by the book 's interplay between the landscape of swamp and jungle , and the " landscape of speech " — the former being " brilliantly registered " with its " immense feeling of physicality " , and the latter , " even more dramatic " in the way it exposes " the dark underside of words " and how its use and misuse reveals the true nature of a person . He was particularly impressed by the depiction of Nazi hunter Emmanuel Lieber and his role as representative of the Jewish consciousness . Rosenfeld noted that while Holocaust literature often either soars to " expostulation and apostrophe " , or sinks to " a dwindling sob of elegiac lament " , Steiner 's Lieber " mediates between these two extremes , ... simultaneously records and mourns , coldly enumerates yet carries an immense affect " . What did concern the reviewer , however , was the way Steiner used ideas from his earlier works , that he had put them " virtually verbatim " into Hitler 's mouth , creating the impression that " Steiner 's understanding of Hitler were identical with the latter 's self @-@ understanding " . Rosenfeld also questioned why the book had to end with Hitler 's speech . He said that Steiner 's fictive Hitler plays " the devil 's game of language subversion " , making " madness [ sound ] like music " , something the real Hitler had perfected . By stopping at this point , Rosenfeld felt that Steiner " succumb [ s ] , rhetorically , to the seductive eloquence of negation " , which undermines his own " high standards of moral intelligence " . But overall Rosenfeld said The Portage " must be counted among the most vigorous attempts to portray the presence and meaning of Hitler " , forcing us to confront him " in a way hardly seen before in fiction " . 
- The Portage to San Cristobal of A.H. was a finalist in the 1983 PEN / Faulkner Award for Fiction . 
- 
- = = Controversy = = 
- 
- The book generated considerable controversy because of its apparent " admiration for Hitler " . The controversy grew further when the faithful stage adaptation ( " too faithful " , according to Steiner ) was performed in the United Kingdom and the United States . 
- Hitler 's speech at the end of the book disturbed many readers and critics . Steiner not only lets Hitler justify his past , he allows him the ( almost ) last word before the outside world invades . The fact that Steiner is Jewish made this speech in particular even more contentious . One critic , while acknowledging that Steiner always saw Hitler as " the incarnation of unprecedented and unparalleled evil " , felt that there was no clear distinction in the book between Steiner 's own views and those of his fictional Hitler , even going so far as to accuse Steiner , who rejects Jewish nationalism and is a critic of Israel 's treatment of the Palestinians , of anti @-@ Semitism . 
- In contrast , a Time magazine article at the time felt that Steiner 's intention for the Hitler speech was to use it to explore his previously stated belief " that Hitler wielded language as an almost supernatural force " , drawing attention to Nazi hunter Emmanuel Lieber 's warning from the book regarding Hitler : " There shall come a man who ... will know the grammar of hell and teach it to others . He will know the sounds of madness and loathing and make them seem music . " 
- Steiner responded to criticism that Hitler 's speech in this book is unchallenged by saying that it had been done before : for example Satan 's speech in Milton 's Paradise Lost ( 1667 ) , and The Grand Inquisitor 's speech in Dostoyevsky 's The Brothers Karamazov ( 1880 ) . He also reminded the reader that Hitler 's speech is balanced out earlier in the book by Lieber 's long monologue on the horrors of the Holocaust . Finally , Steiner said that his Hitler ( A. H. ) is " a fictive figure " , and that it is not he who has the last word , but Teku , the Indian tracker , who shouts " Proven " . Teku is also the Hebrew word used to indicate that " there are issues here beyond our wisdom to answer or decide . " 
- 
- 
- = Temnospondyli = 
- 
- Temnospondyli ( from Greek τέμνειν ( temnein , " to cut " ) and σπόνδυλος ( spondylos , " vertebra " ) ) is a diverse subclass of extinct small to giant tetrapods — often considered primitive amphibians — that flourished worldwide during the Carboniferous , Permian , and Triassic periods . A few species continued into the Cretaceous . Fossils have been found on every continent . During about 210 million years of evolutionary history , they adapted to a wide range of habitats , including fresh water , terrestrial , and even coastal marine environments . Their life history is well understood , with fossils known from the larval stage , metamorphosis , and maturity . Most temnospondyls were semiaquatic , although some were almost fully terrestrial , returning to the water only to breed . These temnospondyls were some of the first vertebrates fully adapted to life on land . Although temnospondyls are considered amphibians , many had characteristics , such as scales , claws , and armour @-@ like bony plates , that distinguish them from modern amphibians . 
- Temnospondyls have been known since the early 19th century , and were initially thought to be reptiles . They were described at various times as batrachians , stegocephalians , and labyrinthodonts , although these names are now rarely used . Animals now grouped in Temnospondyli were spread out among several amphibian groups until the early 20th century , when they were found to belong to a distinct taxon based on the structure of their vertebrae . Temnospondyli means " cut vertebrae " , as each vertebra is divided into several parts . 
- Experts disagree over whether temnospondyls were ancestral to modern amphibians ( frogs , salamanders , and caecilians ) , or whether the whole group died out without leaving any descendants . Different hypotheses have placed modern amphibians as the descendants of temnospondyls , another group of early tetrapods called lepospondyls , or even as descendants of both groups ( with caecilians evolving from lepospondyls and frogs and salamanders evolving from temnospondyls ) . Recent studies place a family of temnospondyls called the amphibamids as the closest relatives of modern amphibians . Similarities in teeth , skulls , and hearing structures link the two groups . 
- 
- = = Description = = 
- 
- Many temnospondyls are much larger than living amphibians , and superficially resemble crocodiles . Others are smaller and resemble salamanders . Most have broad , flat heads that are either blunt ( brevirostrine ) or elongated ( longirostrine ) . The skulls are rounded or triangular in shape when viewed from above , and are usually covered in pits and ridges . The rugged surfaces of bones may have supported blood vessels , which could transfer carbon dioxide to the bones to neutralize acidic build up in the blood ( early semiaquatic tetrapods would have had difficulty expelling carbon dioxide from their bodies while on land , and these dermal bones may have been an early solution to the problem ) . Many temnospondyls also have canal @-@ like grooves in their skulls called sensory sulci . The sulci , which usually run around the nostrils and eye sockets , are part of a lateral line system used to detect vibrations in water . As semiaquatic animals , most temnospondyls have small limbs with four toes on each front foot and five on each hind foot . Terrestrial temnospondyls have larger , thicker limbs , and some even have claws . One unusual terrestrial temnospondyl , Fayella , has relatively long limbs for its body , and probably lived as an active runner able to chase prey . 
- Homologues of most of the bones of temnospondyls are also seen in other early tetrapods , aside from a few bones in the skull , such as interfrontals , internasals , and interparietals , that have developed in some temnospondyl taxa . Most temnospondyls have tabular horns in the backs of their skulls , rounded projections of bone separated from the rest of the skull by indentations called otic notches ; in some temnospondyls , such as Zatrachys , they are pointed and very prominent . Among the most distinguishing features of temnospondyls are the interpterygoid vacuities , two large holes in the back of the palate . Another pair of holes , choanae , are present in front of these vacuities , and connect the nasal passage with the mouth . Temnospondyls often have teeth on their palates , as well as in their jaws . Some of these teeth are so large , they are referred to as tusks . In some temnospondyls , such as Nigerpeton , tusks in the lower jaw pierce the palate and emerge through openings in the top of the skull . 
- Very little is known of the soft tissue of temnospondyls . A block of sandstone , described in 2007 from the Early Carboniferous Mauch Chunk Formation of Pennsylvania , included impressions of the bodies of three temnospondyls . These impressions show , when alive , they had smooth skin , robust limbs with webbed feet , and a ridge of skin on their undersides . Trackways referable to small temnospondyls have also been found in Carboniferous and Permian rocks . The trackways , called batrachichni , are usually found in strata deposited around freshwater environments , suggesting the animals had some ties to the water . 
- Unlike modern amphibians , many temnospondyls are covered in small , closely packed scales . The undersides of most temnospondyls are covered in rows of large ventral plates . During early stages of development , they first have only small , rounded scales . Fossils show , as the animals grew , the scales on the undersides of their bodies developed into large , wide ventral plates . The plates overlap each other in a way that allows a wide range of flexibility . Later semiaquatic temnospondyls , such as trematosaurs and capitosaurs , have no evidence of scales . They may have lost scales to make movement easier under water or to allow cutaneous respiration , the absorption of oxygen through the skin . 
- Several groups of temnospondyls have large bony plates on their backs . One temnospondyl , Peltobatrachus , has armour @-@ like plating that covers both its back and underside . The temnospondyl Laidleria also has extensive plating on its back . Most members of the family Dissorophidae also have armor , although it only covers the midline of the back with two narrow rows of plates . Other temnospondyls , such as Eryops , have been found with small , disc @-@ like bony scutes that were in life probably embedded in the skin . All of these temnospondyls were adapted to a terrestrial lifestyle . Armor may have offered protection from predators in the case of Peltobatrachus . The scutes may have provided stability for the spine , as they would have limited flexibility and may have been connected by strong ligaments . Temnospondyls such as Sclerothorax and Eryops that may have been at least partly terrestrial also have long neural spines on top of their vertebrae that would have stabilized the spine . Bony scutes are also seen in plagiosaurs , but unlike Peltobatrachus , Laidleria , Eryops , and dissorophids , these animals are thought to have been fully aquatic . Plagiosaurs may have inherited their armor from a terrestrial ancestor , as both Peltobatrachus and Laidleria have been considered close relatives of the group . 
- Temnospondyls ' vertebrae are divided into several segments . In living tetrapods , the main body of the vertebra is a single piece of bone called the centrum , but in temnospondyls , this region was divided into a pleurocentrum and intercentrum . Two types of vertebrae are recognized in temnospondyls : stereospondylous and rhachitomous vertebrae . In rhachitomous vertebrae , the intercentra are large and wedge @-@ shaped , and the pleurocentra are relatively small blocks that fit between them . Both elements support a spine @-@ like neural arch , and well @-@ developed interlocking projections called zygapophyses strengthen the connections between vertebrae . The strong backbone and strong limbs of many rhachitomous temnospondyls allowed them to be partially , and in some cases fully , terrestrial . In stereospondylous vertebrae , the pleurocentra have been lost entirely , with the intercentra enlarged as the main body of the vertebrae . This weaker type of backbone indicates the stereospondylous temnospondyls spent more time in water . 
- 
- = = History of study = = 
- 
- Temnospondyli was named by German palaeontologist Karl Alfred von Zittel in his second edition of Handbuch der Palaeontologie , published in 1888 . Temnospondyl remains were known since the early part of the 19th century , however . The earliest described temnospondyl was Mastodonsaurus , named by Georg Friedrich Jaeger in 1828 . Jaeger named Mastodonsaurus from a single tooth , and considered it a reptile . Mastodonsaurus means " breast tooth lizard " after the nipple @-@ like shape of the tip of the tooth . 
- The naming of these first specimens was disputed , however . Leopold Fitzinger named the animal Batrachosaurus in 1837 . In 1841 , English palaeontologist Richard Owen referred to the genus as Labyrinthodon to describe its highly folded or labyrinthine teeth . Owen thought the name Mastodonsaurus " ought not to be retained , because it recalls unavoidably the idea of the mammalian genus Mastodon , or else a mammilloid form of the tooth ... and because the second element of the word , saurus , indicates a false affinity , the remains belonging , not to the Saurian , but to the Batrachian order of Reptiles . " Owen recognized the animal was not a " saurian " reptile , yet he also referred Jaeger 's Phytosaurus to the genus . Although the two genera both have similarly sized conical teeth , Phytosaurus was later found to be a crocodile @-@ like reptile . Additional material , including skulls , firmly placed Labyrinthodon as an amphibian . Jaeger also named Salamandroides giganteus in 1828 , basing it on partial occiput , or back portion of the skull . In 1833 , he described a complete skull of S. giganteus that had the same teeth as his Mastodonsaurus , making it the first known complete skull of a temnospondyl . Because Mastodonsaurus was named first , it has precedence over the other names as a senior subjective synonym . Batrachosaurus is still used as the name of an unrelated brachyopid temnospondyl . 
- Mastodonsaurus and other similar animals were referred to as labyrinthodonts , named like Labyrinthodon for teeth that were highly folded in cross section . Owen 's " Labyrinthodon Jaegeri " was later found at Guy 's Cliffe , England by paleontologist William Buckland . Other specimens were found in the red sandstone of Warwickshire . As more fossils were uncovered in England , Owen depicted these labyrinthodonts as the " highest " form of batrachian and compared them to crocodiles , which he considered the highest form of reptiles . He also noted the large labyrinthodonts of the Keuper ( a unit of rocks that dates to the Late Triassic ) were younger than more advanced reptiles in the Magnesian and Zechstein , which are Late Permian in age . Owen used these fossils to counter the notion that reptiles evolved from a sequential progression from early amphibians ( what he called " metamorphosed fishes " ) . 
- In addition to Mastodonsaurus , some of the earliest named genera included Metopias and Rhombopholis in 1842 , Zygosaurus in 1848 , Trematosaurus in 1849 , Baphetes and Dendrerpeton in 1853 , Capitosaurus in 1858 , and Dasyceps in 1859 . Baphetes is now placed as an early tetrapod outside Temnospondyli , and Rhombopholis is now considered a prolacertiform reptile . 
- Later in the 19th century , temnospondyls were classified as various members of Stegocephalia , a name coined by American paleontologist Edward Drinker Cope in 1868 . Cope placed stegocephalians in the class Batrachia , the name then used for Amphibia . Stegocephalia means " roof @-@ headed " in Greek , a reference to the wide , flat heads of temnospondyls and other early tetrapods . During this time , palaeontologists considered temnospondyls to be amphibians because they possessed three main features : gill arches in juvenile skeletons , indicating they were amphibious for at least the first part of their lives ; ribs that do not connect at the underside of the rib cage ; and deep pits in the skull that were interpreted as space for mucous glands . 
- Several suborders of stegocephalians were recognized in the late 19th and early 20th centuries . Animals now regarded as temnospondyls were primarily labyrinthodonts , but some were classified in the Branchiosauria . Branchiosaurs were small @-@ bodied and had simple conical teeth , while labyrinthodonts were larger and had complex , folded dentin and enamel in their teeth . Branchiosauria included only a few forms , such as Branchiosaurus from Europe and Amphibamus from North America , that had poorly developed bones , external gills , and no ribs . Some skeletons of Amphibamus were later found with long ribs , prompting its reassignment to Microsauria ( although more detailed studies found it to be a temnospondyl ) . Soft tissue , such as scales and external gills , was found in many well @-@ preserved branchiosaur fossils from Germany . In the early 20th century , branchiosaurs would be recognized as larval forms of temnospondyls lacking many of the typical features that define the group , and Branchiosauria is no longer recognized as a distinct group . 
- Other animals that would later be classified as temnospondyls were placed in a group called Ganocephala , characterized by plate @-@ like skull bones , small limbs , fish @-@ like scales , and branchial arches . Unlike labyrinthodonts , they did not have parietal foramina , small holes in their skulls behind their eye sockets . Archegosaurus , Dendrerpeton , Eryops and Trimerorhachis were placed in this group and were considered to be the most primitive members of Reptilia . Their rhachitomous vertebrae , notochord , and lack of occipital condyles ( which attached the head to the neck ) were features that were also shared with fishes . Thus , they were considered a link between early fishes and more advanced forms such as stegocephalians . 
- Another group called Microsauria was named by Cope in 1868 . Cope classified Microsauria as a subgroup of Labyrinthodontia , placing many small , amphibian @-@ like animals within it . Among them were Dendrerpeton , once placed in Ganocephala . Dendrerpeton was later placed as a labyrinthodont with other temnospondyls , but confusion existed for many years over the classification of small amphibians . 
- By the end of the 19th century , most of what are today regarded as temnospondyls were placed in the suborder Labyrinthodonta . American paleontologist Ermine Cowles Case called it Labyrinthodonta vera or " true labyrinthodonts " . The names Stegocephalia and Labyrinthodontia were used interchangeably to refer to the order in which it belonged . The labyrinthodontian suborders Microsauria and Branchiosauria , both of which contain temnospondyls , were distinct from Labyrinthodonta . Within Labyrinthodonta were the groups Rhachitomi , Labyrinthodonti , and Embolerimi . Members of Rhachitomi , such as Archegosaurus and Eryops , had rhachitomous vertebrae with enlarged intercentra that displaced the pleurocentra . Labyrinthodonti , such as Mastodonsaurus , Trematosaurus , and Micropholis , had lost their pleurocentra , and the intercentra made up the entire body of the vertebrae . Embolerimi had intercentra and pleurocentra that were of equal size . Embolomeres are now identified as reptiliomorphs distantly related to temnospondyls . 
- In 1888 , von Zittel divided stegocephalians among three taxa : Lepospondyli , Temnospondyli , and Stereospondyli . He placed microsaurs in Lepospondyli , a group which he characterized as having simple , spool @-@ shaped vertebral centra . Temnospondyli included forms with the centra divided into pleurocentra and intercentra . All members of Stereospondyli had amphicoelous centra composed only of the intercentra . Cope objected to von Zittel 's classification , considering the vertebrae of lepospondyls and stereospondyls indistinguishable because each had a simple spool shape . He continued to use Ganocephala and Labyrinthodonta ( which he alternatively referred to as Rhachitomi ) to distinguish animals based on the absence or presence of occipital condyles . 
- Temnospondyli became a commonly used name at the turn of the century . Paleontologists included both embolomeres and rhachitomes in the group . Cope 's Ganocephala and Labyrinthodonta fell out of use . In 1919 , British paleontologist D. M. S. Watson proposed that the evolutionary history of these large amphibians could be seen through changes in their vertebrae . Embolomerous forms in the Carboniferous graded into rhachitomous forms in the Permian , and finally into stereospondyls in the Triassic . More importantly , Watson began using the term Labyrinthodontia to refer to these groups . The name Temnospondyli was rarely used in the decades that followed . Swedish paleontologist Gunnar Säve @-@ Söderbergh removed embolomeres from the group , narrowing its scope to rhachitomes and stereospondyls . His classification of labyrinthodonts was based heavily on characteristics of the skull rather than the vertebrae . 
- American paleontologist Alfred Romer brought the name Temnospondyli back into use in the later 20th century . Säve @-@ Söderbergh used the name Labyrinthodontia in a strict sense ( sensu stricto ) to refer to Rhachitomi and Stereospondyli , excluding Embolomeri . Romer agreed with this classification , but used the name Temnospondyli to avoid confusion with Labyrinthodontia in its wider sense ( sensu lato ) . Unlike modern temnospondyl classification , however , Romer included the primitive Ichthyostegalia in the group . 
- 
- = = Evolutionary history = = 
- 
- 
- = = = Carboniferous and Early Permian = = = 
- 
- Temnospondyls first appeared in the Early Carboniferous around 330 million years ago ( Mya ) . During the Carboniferous , temnospondyls included basal medium @-@ sized forms such as Dendrerpeton or large semiaquatic forms such as Cochleosaurus . Other , more derived temnospondyls , such as the amphibamids , were smaller and more terrestrial . They resembled salamanders , and some taxa , such as the genus Branchiosaurus , even retained external gills like the modern @-@ day axolotl . During the latest Carboniferous and Early Permian around 300 Mya , several groups , such as the dissorophids and trematopids , evolved strong , robust limbs and vertebrae and became adapted to life on land , while others , such as the eryopids , developed into large semiaquatic predators . The dvinosaurs , a group of small aquatic temnospondyls , evolved from terrestrial ancestors in the Late Carboniferous . 
- 
- = = = Late Permian = = = 
- 
- During the Late Permian , increasing aridity and the diversification of reptiles contributed to a decline in terrestrial temnospondyls , but semiaquatic and fully aquatic temnospondyls continued to flourish , including the large Melosaurus of Eastern Europe . Other temnospondyls , such as archegosaurids , developed long snouts and a close similarity to crocodiles , although they lacked the armor characteristic of the latter group . These temnospondyls included the largest known amphibian , the 9 @-@ m @-@ long Prionosuchus of Brazil . 
- 
- = = = Mesozoic = = = 
- 
- As temnospondyls continued to flourish and diversify in the Late Permian ( 260 @.@ 4 - 251 @.@ 0 Mya ) , a major group called Stereospondyli became more dependent on life in the water . The vertebrae became weak , the limbs small , and the skull large and flat , with the eyes facing upwards . During the Triassic period , these animals dominated the freshwater ecosystems , evolving in a range of both small and large forms . During the Early Triassic ( 251 @.@ 0 - 245 @.@ 0 Mya ) one group of successful long @-@ snouted fish @-@ eaters , the trematosauroids , even adapted to a life in the sea , the only known amphibians to do so with the exception of the modern crab @-@ eating frog . Another group , the capitosauroids , included medium- and large @-@ sized animals 2 @.@ 3 to 4 m ( 7 @.@ 5 to 13 @.@ 1 ft ) in length , with large and flat skulls that could be over a meter long in the largest forms such as Mastodonsaurus . These animals spent most or all their lives in water as aquatic predators , catching their prey by a sudden opening of the upper jaw and sucking in fish or other small animals . 
- In the Carnian stage of the Late Triassic ( 228 @.@ 0 - 216 @.@ 5 Mya ) , capitosauroids were joined by the superficially very similar Metoposauridae . Metoposaurids are distinguished from capitosauroids by the positioning of their eye sockets near the front of their skulls . Another group of stereospondyls , the plagiosaurs , had wide heads with external gills , and adapted to life at the bottom of lakes and rivers . By this time , temnospondyls had become a common and widespread component of semiaquatic ecosystems . Some temnospondyls , such as Cryobatrachus and Kryostega , even inhabited Antarctica , which was covered in temperate forests at the time . 
- Triassic temnospondyls were often the dominant semiaquatic animals in their environments . Large assemblages of metoposaurs with hundreds of individuals preserved together have been found in the southwestern United States . They have often been interpreted as mass death events caused by droughts in floodplain environments . Recent studies show these dense assemblages were instead probably the result of currents accumulating dead individuals in certain areas . These environments seem to have had little diversity , as they were inhabited almost exclusively by metoposaurs . 
- The Triassic @-@ Jurassic extinction event around 199 @.@ 6 Mya led to the extinction of most Mesozoic temnospondyls . The brachyopoids survived , as well as a few capitosauroids and trematosauroids . While the latter two groups soon became extinct , brachyopoids persisted and grew to large sizes during the Jurassic . Among brachyopoids , the brachyopids flourished in China and the chigutisaurids became common in Gondwana . The most recent known temnospondyl was the giant chigutisaurid Koolasuchus , known from the Early Cretaceous of Australia . It survived in rift valleys that were too cold in the winter for crocodiles that normally would have competed with it . Koolasuchus was one of the largest of the brachyopoids , with an estimated weight of 500 kg ( 1 @,@ 100 lb ) . 
- 
- = = Classification = = 
- 
- Originally , temnospondyls were classified according to the structure of their vertebrae . Early forms , with complex vertebrae consisting of a number of separate elements , were placed in the suborder Rachitomi , and large Triassic aquatic forms with simpler vertebrae were placed in the suborder Stereospondyli . With the recent growth of phylogenetics , this classification is no longer viable . The basic rhachitomous condition is found in many primitive tetrapods , and is not unique to one group of temnospondyls . Moreover , the distinction between rhachitomous and stereospondylous vertebrae is not entirely clear . Some temnospondyls have rhachitomous , semirhachitomous , and stereospondylous vertebrae at different points in the same vertebral column . Other taxa have intermediate morphologies that do not fit into any category . Rachitomi is no longer recognized as a group , but Stereospondyli is still considered valid . Below is a simplified taxonomy of temnospondyls showing currently recognized groups : 
- Class Amphibia 
- Order Temnospondyli 
- Superfamily Edopoidea 
- Family Cochleosauridae ( Chenoprosopidae ) 
- Family Edopidae 
- Family Dendrerpetontidae 
- Suborder Euskelia 
- Superfamily Dissorophoidea 
- Family Amphibamidae 
- Family Branchiosauridae 
- Family Dissorophidae 
- Family Micromelerpetontidae 
- Superfamily Eryopoidea 
- Family Eryopidae 
- Family Parioxyidae 
- Family Zatrachydidae 
- Clade Limnarchia 
- Clade Stereospondylomorpha 
- Superfamily Archegosauroidea 
- Family Actinodontidae 
- Family Archegosauridae 
- Family Intasuchidae ( placement is uncertain ) 
- Family Sclerocephalidae 
- Suborder Stereospondyli 
- Family Peltobatrachidae 
- Family Lapillopsidae 
- Family Rhinesuchidae 
- Family Lydekkerinidae 
- Clade Capitosauria 
- Superfamily Mastodonsauroidea ( Capitosauroidea ) 
- Family Heylerosauridae 
- Family Mastodonsauridae 
- Family Stenotosauridae 
- Infraorder Trematosauria 
- Superfamily Trematosauroidea 
- Superfamily Metoposauroidea 
- Superfamily Plagiosauroidea 
- Superfamily Brachyopoidea 
- Superfamily Rhytidosteoidea 
- 
- = = = Phylogeny = = = 
- 
- In one of the earliest phylogenetic analyses of the group , Gardiner ( 1983 ) recognized five characteristics that made Temnospondyli a clade : a bone at the back of the skull , the parasphenoid , is connected to another bone on the underside of the skull , the pterygoid ; large openings called interpterygoid vacuities are present between the pterygoids ; the stapes ( a bone involved in hearing ) is connected to the parasphenoid and projects upward ; the cleithrum , a bone in the pectoral girdle , is thin ; and part of the vertebra called the interdorsal attaches to the neural arch . Additional features were given by Godfrey et al . ( 1987 ) , including the contact between the postparietal and exoccipital at the back of the skull , small projections ( uncinate processes ) on the ribs , and a pelvic girdle with each side having a single iliac blade . These shared characteristics are called synapomorphies . 
- Temnospondyls are placed as basal tetrapods in phylogenetic analyses , with their exact positioning varying between studies . Depending on the classification of modern amphibians , they are either included in the crown group Tetrapoda or the stem of Tetrapoda . Crown @-@ group tetrapods are descendants of the most recent common ancestor of all living tetrapods and stem tetrapods are forms that are outside the crown group . Modern amphibians have recently been suggested as descendants of temnospondyls , which would place them within crown Tetrapoda . Below is a cladogram from Ruta et al . ( 2003 ) placing Temnospondyli within crown Tetrapoda : 
- Other studies place modern amphibians as the descendants of lepospondyls and place temnospondyls in a more basal position within the stem of Tetrapoda . Below is a cladogram from Laurin and Reisz ( 1999 ) placing Temnospondyli outside crown Tetrapoda : 
- Most phylogenetic analyses of temnospondyl interrelationships focus on individual families . One of the first broad @-@ scale studies of temnospondyl phylogeny was conducted by paleontologist Andrew Milner in 1990 . A 2007 study made a " supertree " of all temnospondyl families , combining the family @-@ level trees of previous studies . The following cladogram is modified from Ruta et al . ( 2007 ) : 
- 1 Temnospondyli , 2 Edopoidea , 3 Dvinosauria , 4 Euskelia , 5 Eryopoidea , 6 Dissorophoidea , 7 Limnarchia , 8 Archegosauroidea , 9 Stereospondyli , 10 Rhytidostea , 11 Brachyopoidea , 12 Capitosauria , 13 Trematosauria , 14 Metoposauroidea 
- The most basal group of temnospondyls is the superfamily Edopoidea . Edopoids have several primitive or plesiomorphic features , including a single occipital condyle and a bone called the intertemporal that is absent in other temnospondyls . Edopoids include the Late Carboniferous genus Edops and the family Cochleosauridae . Dendrerpetontidae has also been included in Edopoidea , and is the oldest known temnospondyl family . Balanerpeton woodi is the oldest species , having been present over 330 million years ago during the Viséan stage of the Early Carboniferous . Recent analyses place Dendrerpetontidae outside Edopoidea in a more derived position . Other primitive temnospondyls include Capetus and Iberospondylus . Saharastega and Nigerpeton , both described in 2005 from Niger , are also primitive yet come from the Late Permian . They are almost 40 million years younger than other basal temnospondyls , implying a long ghost lineage of species that are not yet known in the fossil record . 
- In 2000 , paleontologists Adam Yates and Anne Warren produced a revised phylogeny of more derived temnospondyls , naming several new clades . Two major clades were Euskelia and Limnarchia . Euskelia includes the temnospondyls that were once called rhachitomes and includes two superfamilies , the Dissorophoidea and the Eryopoidea . Dissorophoids include small , mostly terrestrial temnospondyls that may be the ancestors of modern amphibians . Eryopoids include larger temnospondyls like Eryops . The second major clade , Limnarchia , includes most Mesozoic temnospondyls , as well as some Permian groups . Within Limnarchia are the superfamily Archegosauroidea and the most derived temnospondyls , the stereospondyls . 
- Yates and Warren also named Dvinosauria , a clade of small aquatic temnospondyls from the Carboniferous , Permian , and Triassic . They placed Dvinosauria within Limnarchia , but more recent studies disagree on their position . For example , a 2007 study places them even more basal than euskelians , while a 2008 study keeps them as basal limnarchians . 
- Within Stereospondyli , Yates and Warren erected two major clades : Capitosauria and Trematosauria . Capitosaurs include large semiaquatic temnospondyls like Mastodonsaurus with flat heads and eyes near the back of the skull . Trematosaurs include a diversity of temnospondyls , including large marine trematosauroids , aquatic plagiosaurs , brachyopoids that survived into the Cretaceous , and metoposauroids with eyes near the front of their heads . In 2000 , paleontologists Rainer Schoch and Andrew Milner named a third major clade of stereospondyls , the Rhytidostea . This group included more primitive stereospondyls that could not be placed in either Capitosauria or Trematosauria , and included groups like Lydekkerinidae , Rhytidosteidae , and Brachyopoidea . While Capitosauria and Trematosauria are still widely used , Rhytidostea is not often supported as a true clade in recent analyses . Rhytidosteids and brachyopoids are now grouped with trematosaurians , but lydekkerinids are still considered to be a primitive family of stereospondyls . 
- A new phylogeny of temnospondyls was offered by paleontologist Rainer Schoch in 2013 . It supported many of the clades that were found by Yates and Warren , but it did not find support for their division of derived temnospondyls into Euskelia and Limnarchia . Eryopids were found to be more closely related to stereospondyls than to dissorophoids , which were grouped with dvinosaurs . The clade including Eryopidae and Stereospondylomorpha was named Eryopiformes . In addition , Schoch named the clade containing all temnospondyls except edopoids Eutemnospondyli and reinstated the name Rhachitomi for the clade containing all temnospondyls except edopoids and dendrerpetontids . Below is the cladogram from Schoch 's analysis : 
- 
- = = = Relationship to modern amphibians = = = 
- 
- Modern amphibians ( frogs , salamanders , and caecilians ) are classified in Lissamphibia . Lissamphibians appear to have arisen in the Permian . Molecular clock estimates place the first lissamphibian in the Late Carboniferous , but the first member of Batrachia ( frogs and salamanders , but not caecilians ) is estimated to have appeared in the Middle Permian using the same technique . Using fossil evidence , there are three main theories for the origin of modern amphibians . 
- One is that they evolved from dissorophoid temnospondyls . Another is that they evolved from lepospondyls , most likely the lysorophians . A third hypothesis is that caecilians descended from lepospondyls and frogs and salamanders evolved from dissorophoids . 
- Recently , the theory that temnospondyls were the ancestors of all lissamphibians has gained wide support . The skull morphology of some small temnospondyls has been compared to those of modern frogs and salamanders , but the presence of bicuspid , pedicellate teeth in small , paedomorphic or immature temnospondyls has been cited as the most convincing argument in favor of the temnospondyl origin of lissamphibians . Seen in lissamphibians and many dissorophoid temnospondyls , pedicellate teeth have calcified tips and bases . During the development of most tetrapods , teeth begin to calcify at their tips . Calcification normally proceeds downward to the base of the tooth , but calcification from the tip stops abruptly in pedicellate teeth . Calcification resumes at the base , leaving an area in the center of the tooth uncalcified . This pattern is seen in living amphibians and fossils . 
- The dissorophoid family Amphibamidae is thought to be most closely related to Lissamphibia . In 2008 , an amphibamid called Gerobatrachus hottoni was named from Texas and was nicknamed the " frogamander " for its frog @-@ like head and salamander @-@ like body . It was thought to be the most closely related temnospondyl to lissamphibians and was placed as the sister taxon of the group in a phylogenetic analysis . Another species of amphibamid called Doleserpeton annectens is now thought to be even more closely related to lissamphibians . Unlike Gerobatrachus , Doleserpeton was known since 1969 , and the presence of pedicellate teeth in its jaws has led some paleontologists to conclude soon after its naming that it was a relative of modern amphibians . It was first described as a " protolissamphibian " , and the specific name annectens means " connecting " in reference to its inferred transitional position between temnospondyls and lissamphibians . The structure of its tympanum , a disk @-@ like membrane that functions like an ear drum , is similar to that of frogs and has also been used as evidence for a close relationship . Other features including the shape of the palate and the back of the skull , the short ribs , and the smooth skull surface also point to it being a closer relative of lissamphibians than is Gerobatrachus . Below is a cladogram modified from Sigurdsen and Bolt ( 2010 ) showing the relationships of Gerobatrachus , Doleserpeton , and Lissamphibia : 
- 
- = = Paleobiology = = 
- 
- 
- = = = Feeding = = = 
- 
- Although the earliest temnospondyls were primarily semiaquatic , they had the ability to feed on land . Later , eryopoids and dissorophoids , some well adapted to terrestrial life , also fed on land . Some eryopoids became better adapted toward life in water , and shifted their diets toward aquatic organisms . The first primarily aquatic feeders were archegosaurs in the Permian . Trematosaurs and capitosaurs became independently aquatic and also returned to this type of feeding . 
- Most aquatic stereospondyls have flattened heads . When feeding , they probably opened their mouths by lifting their skulls instead of lowering their lower jaws . The jaw mechanics of the plagiosaurid Gerrothorax are well known , and are among the most highly adapted . Gerrothorax is thought to have lifted its skull to around 50 ° above horizontal through the flexing of the atlanto @-@ occipital joint between the occipital condyles of the skull and the atlas vertebra of the neck . As the skull is raised , the quadrate bone pushes forward and causes the lower jaw to protrude outward . Other stereospondyls probably also lifted their skulls , but they are not as well adapted for such movement . D.M.S. Watson was the first to suggest skull lifting as a means of feeding in temnospondyls . He envisioned that Mastodonsaurus , a much larger temnospondyl than Gerrothorax , was able to make the same movement . Paleontologist A.L. Panchen also supported the idea in 1959 , suggesting that Batrachosuchus also fed in this way . At the time it was thought that these temnospondyls lifted their heads with strong jaw muscles , but it is now thought that they used larger muscles in the neck that were attached to the large pectoral girdle . Plagiosuchus , a close relative of Gerrothorax , also has a hyobranchial skeleton that muscles may have attached to . Plagiosuchus has very small teeth and a large area for muscle attachment behind the skull , suggesting that it could suction feed by rapidly opening its mouth . 
- Unlike semiaquatic temnospondyls , terrestrial temnospondyls have skulls that are adapted for biting land @-@ living prey . The sutures between the bones of the skull in the dissorophoid Phonerpeton are able to withstand a high degree of compression . Compressive forces would have been experienced when biting down on prey . Earlier aquatic tetrapods and tetrapod ancestors differ from temnospondyls like Phonerpeton in that their skulls were also built to withstand tension . This tension would have been experienced during suction feeding underwater . Temnospondyls like Phonerpeton were among the first tetrapods that were almost exclusively terrestrial and fed by biting . 
- 
- = = = Reproduction = = = 
- 
- Temnospondyls , like all amphibians , reproduced in aquatic environments . Most temnospondyls probably reproduced through external fertilization . Like most living frogs , female temnospondyls would have laid masses of eggs in water while males released sperm to fertilize them . Several fossils were described from the Early Permian of Texas in 1998 that may be egg masses of dissorophoid temnospondyls . They were the first known fossils of amphibian eggs . The fossils consist of small disks with thin membranes that are probably vitelline membranes and halo @-@ like areas surrounding them that are most likely mucous coatings . They are attached to plant fossils , suggesting that these temnospondyls laid eggs on aquatic plants much like modern frogs . The mucous membranes show that the eggs were laid by amphibians , not fish ( their eggs lack mucus ) , but the type of amphibian that laid them cannot be known because no body fossils are preserved with the eggs . The eggs are thought to be from dissorophoids because they are likely to be close relatives of modern amphibians , and probably had similar reproductive strategies . They are also the most common amphibians from the deposit in which the eggs were found . 
- One temnospondyl , the dvinosaur Trimerorhachis , may have brooded young in an area between the gills called the pharyngeal pouch . Small bones belonging to younger Trimerorhachis individuals have been found in these pouches . The living Darwin 's Frog is also a mouth brooder and would be the closest modern analogue to Trimerorhachis if it cared for its young in this way . An alternative possibility is that Trimerorhachis was cannibalistic , eating its young like many amphibians do today . If this was the case , the bones of these smaller individuals were originally located in the throat and were pushed into the pharyngeal pouch as the animal fossilized . 
- Body impressions of Early Carboniferous temnospondyls from Pennsylvania suggest that some terrestrial temnospondyls mated on land like some modern amphibians . They reproduced through internal fertilization rather than mating in water . The presence of three individuals in one block of sandstone shows that the temnospondyls were gregarious . The head of one individual rests under the tail of another in what may be a courtship display . Internal fertilization and similar courtship behavior are seen in modern salamanders . 
- 
- = = = Growth = = = 
- 
- While most types of temnospondyls are distinguished on the basis of features in mature specimens , several are known from juvenile and larval specimens . Metamorphosis is seen in dissorophoids , eryopids , and zatrachydids , with aquatic larvae developing into adults capable of living on land . Several types of dissorophoids do not fully metamorphose , but retain features of juveniles such as gills and small body size in what is known as neoteny . Dvinosaurians and the plagiosaurid Gerrothorax were also neotenic because they retained gills , but they are only known from adult specimens . 
- Temnospondyl larvae are often distinguished by poorly developed bones and the presence of a hyobranchial apparatus , a series of bones that gills would attach to in life . However , some fully mature temnospondyls also possess hyobranchial bones but did not have external gills . A dense covering of scales is also seen in larvae and adults . Major body changes occur in metamorphosis , including the reshaping and strengthening of skull bones , the thickening of postcranial bones , and an increase in body size . 
- Temnospondyls like Sclerocephalus are known from both large adult specimens and small larvae , showing an extreme change in body shape . In these species , the shape and proportions of skull bones change in the early stages of development . The ornamentation on the surface of the skull roof also develops at this time . Small , regularly spaced pits are the first to form , followed by larger ridges . As development continues , the external gills disappear . Small teeth that once covered the palate are lost . The postcranial skeleton does not develop at the same rate as the skull , with ossification ( the replacement of cartilage by bone ) happening more slowly . Vertebrae and limb bones are poorly developed , ribs and fingers are absent in the early stages , and the scapulocoracoid and ischium are entirely absent through most of development . Once maturity is reached , most bones have fully formed and growth rate slows . The bones of some temnospondyls like Dutuitosaurus show growth marks , possibly an indication that growth rate varied with the change in seasons . Fossils of temnospondyls like Metoposaurus and Cheliderpeton show that individuals grew larger past maturity . The oldest individuals usually have more pitting on their skulls with deeper sulci . 
- One group of temnospondyls , the Branchiosauridae , is also known from larval specimens . Branchiosaurids like Branchiosaurus and Apateon are represented by many fossils preserving skin and external gills . An entire growth series is exhibited in the wide range of sizes among specimens , but the lack of terrestrially adapted adult forms suggests that these temnospondyls were neotenic . Unlike other temnospondyls , their postcranial skeletons developed quickly but were still partly cartilaginous when fully mature . Adults likely had an aquatic lifestyle similar to juveniles . Recently , large specimens of Apateon gracilis were described with adaptations toward a terrestrial lifestyle , indicating that not all branchiosaurs were neotenic . 
- While most temnospondyls are aquatic in early stages of life , most metoposaurids appear to have been terrestrial in their juvenile stage . Like other Mesozoic temnospondyls , adult metoposaurids were adapted to a semiaquatic lifestyle . Their bones are not highly developed for movement on land . The cross @-@ sectional thickness of limb bones in adult metoposaurids shows that they could not withstand the stress of terrestrial locomotion . Juvenile individuals have bones that are thick enough to withstand this stress , and could probably move about on land . To maintain a terrestrial lifestyle , a temnospondyl 's limb bones would have to thicken with positive allometry , meaning that they would grow at a greater rate than the rest of the body . This is not the case in metoposaurids , meaning that as their bodies grew larger they became less adapted toward a terrestrial lifestyle . 
- 
- = = = Hearing = = = 
- 
- Temnospondyls and other early tetrapods have rounded otic notches in the back of the skull that project into the cheek region . In life , the otic notch would have been covered by a membrane called the tympanum , which is seen as a disk @-@ like area in living frogs . The tympanum is involved in hearing , and is similar to the ear drum of more advanced tetrapods . It was traditionally thought that the tympanum developed very early in tetrapod evolution as a hearing organ and progressed to form the ear drum of amniotes . Thus , temnospondyls possessed a hearing system supposedly ancestral to that of living amphibians and reptiles . 
- Frogs and all other living tetrapods have a rod @-@ like bone called the stapes that aids in hearing by transferring vibrations from the ear drum — or homologous tympanum — to the inner ear . Temnospondyls also have a stapes , which projects into the otic cavity . The stapes likely evolved from the hyomandibula of lobe @-@ finned fishes . The positioning of the stapes and the shape of the otic region suggests that the tympani of temnospondyls and frogs are homologous , but the tympani of these amphibians are no longer considered homologous with the hearing systems of reptiles , birds , and mammals . Therefore , ear structures in temnospondyls were not ancestral to those of all other tetrapods . 
- The ability of the tympanum and stapes to effectively transmit vibrations is called impedance matching . Early tetrapods like temnospondyls have thick stapes with poor impedance matching , so it is now thought that they were not used for hearing . Instead , these thick stapes may have functioned to support the tissue that covers the otic notch . Early temnospondyls like Dendrerpeton could not hear airborne sound but would have been able to detect vibration in the ground . Later temnospondyls like Doleserpeton had otic regions adapted to hearing . Doleserpeton has a structure in the inner ear called the perilymphatic duct , which is also seen in frogs and is associated with hearing . Its stapes is also a better transmitter of sound . The hearing system of Doleserpeton and related temnospondyls was able to detect airborne sound and may have been ancestral to that of living amphibians . 
- 
- 
- = Osbert de Bayeux = 
- 
- Osbert de Bayeux ( floruit 1121 to 1184 ) was a medieval English cleric and archdeacon in the Diocese of York . A relative of Thurstan , the Archbishop of York , Osbert probably owed his ecclesiastical positions to this relative . After Thurstan 's death , Osbert was opposed to one of the candidates for the archbishopric , William fitzHerbert , and worked to secure fitzHerbert 's deposition and replacement by Henry Murdac . After Murdac 's death in 1153 , Osbert tried to prevent the return of fitzHerbert , but these attempts were unsuccessful . When fitzHerbert died suddenly in 1154 , Osbert was accused of murdering the newly returned archbishop . Although he was never convicted of the murder in either a secular or an ecclesiastical court , he was stripped of his clerical status and became a layman before 1158 . He died after 1184 , perhaps even after 1194 . 
- 
- = = Early life = = 
- 
- Osbert was first mentioned in the historical record between 1121 and 1128 when he appears in a charter , which although likely a forgery , probably contains an authentic witness list . This document lists him as " Osbert archdeacon " , which means that he probably held the archdeaconry of Richmond . He was the nephew of Thurstan , who was Archbishop of York from 1114 to 1140 . Presumably he owed his position as archdeacon to his uncle and was probably appointed at a young age . A charter of Thurstan 's , dating to around 1138 , names Osbert explicitly as Thurstan 's nephew . 
- 
- = = Opposition to William fitzHerbert = = 
- 
- Osbert was opposed to the election of William fitzHerbert as Archbishop of York and supported William 's rival and successor Henry Murdac . Although he remained a supporter of Murdac after 1147 , he did oppose Murdac 's interventions in Selby Abbey , where Murdac had deposed one abbot and appointed another . In 1153 , Osbert deposed Murdac 's choice as abbot of Selby and appointed another abbot . Originally , Osbert had supported Elias Paynel , Murdac 's choice for abbot , but then changed his stance and helped with the deposition . 
- After Murdac 's death in 1153 , Osbert was opposed to William 's return as archbishop , but was unsuccessful in his attempts to prevent William 's reappointment . William died a week after his return to York , however , and Osbert , along with Robert of Ghent , the Dean of York , secured the quick election of the new archbishop , Roger de Pont L 'Évêque . 
- 
- = = Poisoning accusations = = 
- 
- Osbert was accused of murdering William , specifically by poisoning him through the communion chalice . A fellow cleric , Symphorian , who had been a chaplain of the deceased archbishop , brought murder charges against Osbert . Symphorian obtained a hearing on the charges at a royal council presided over by King Stephen of England at Michaelmas in 1154 , but Stephen 's subsequent death prevented a resolution . Osbert attempted to have the trial switched to an ecclesiastical court and was supported in his efforts by Archbishop Theobald of Canterbury . A trial was finally held in 1156 and Osbert 's accuser did not produce any witnesses , but Osbert was unable to prove his innocence , prompting the transfer of the case to a papal court . No record of any judgment exists , but Osbert apparently appeared before two popes , Adrian IV and Alexander III . A further appeal to the papal court was referred to papal judges @-@ delegate between 1175 and 1180 . 
- The case attracted commentary by two contemporary writers . John of Salisbury , who was a secretary for Theobald , added information about Osbert in a letter to Alexander III on unrelated business . In that section of the letter , John pointed out to the pope that no matter what others might say about Osbert , he had failed to secure other clergy willing to swear that he was innocent . Another contemporary , Gilbert Foliot , who was Bishop of Hereford , wrote to the pope to remind him that although Osbert 's accuser had offered to prove his accusations by undergoing a trial by ordeal , this was essentially meaningless since canon law forbade the clergy from the ordeal . 
- 
- = = Later life and death = = 
- 
- Osbert was no longer archdeacon by 1158 , as his successor is attested by that point . Osbert , however , continued to call himself " archdeacon " even though he held land as a secular lord , including lands in Lacy and Skipton . He also acted as a steward for Hugh de Tilly . Osbert was still alive in 1184 , as he was a witness to a document at York then , and may have been alive as late as 1194 , when Hugh Bardulf was responsible for the farm of Osbert 's lands , since the record of that transaction in the escheat roll does not make clear whether Osbert was alive or dead at that time . 
- Osbert had two sons , William de Bayeux and Turstin de Baius . Osbert was a benefactor to a number of monasteries , including Drax Priory , Pontefract Priory and Gisborough Priory . He also gave land to a hospital in York and to the Templars and Hospitallers . 
- 
- 
- = Dvorak technique = 
- 
- The Dvorak technique ( developed between 1969 and 1984 by Vernon Dvorak ) is a widely used system to estimate tropical cyclone intensity ( which includes tropical depression , tropical storm , and hurricane / typhoon / intense tropical cyclone intensities ) based solely on visible and infrared satellite images . Within the Dvorak satellite strength estimate for tropical cyclones , there are several visual patterns that a cyclone may take on which define the upper and lower bounds on its intensity . The primary patterns used are curved band pattern ( T1.0 @-@ T4.5 ) , shear pattern ( T1.5 @-@ T3.5 ) , central dense overcast ( CDO ) pattern ( T2.5 @-@ T5.0 ) , central cold cover ( CCC ) pattern , banding eye pattern ( T4.0 @-@ T4.5 ) , and eye pattern ( T4.5 - T8.0 ) . 
- Both the central dense overcast and embedded eye pattern use the size of the CDO . The CDO pattern intensities start at T2.5 , equivalent to minimal tropical storm intensity ( 40 mph , 65 km / h ) . The shape of the central dense overcast is also considered . The eye pattern utilizes the coldness of the cloud tops within the surrounding mass of thunderstorms and contrasts it with the temperature within the eye itself . The larger the temperature difference is , the stronger the tropical cyclone . Once a pattern is identified , the storm features ( such as length and curvature of banding features ) are further analyzed to arrive at a particular T @-@ number . The CCC pattern indicates little development is occurring , despite the cold cloud tops associated with the quickly evolving feature . 
- Several agencies issue Dvorak intensity numbers for tropical cyclones and their precursors , including the National Hurricane Center 's Tropical Analysis and Forecast Branch ( TAFB ) , the NOAA / NESDIS Satellite Analysis Branch ( SAB ) , and the Joint Typhoon Warning Center at the Naval Meteorology and Oceanography Command in Pearl Harbor , Hawaii . 
- 
- = = Evolution of the method = = 
- 
- Vernon Dvorak began developing this technique in 1969 , using satellite pictures of tropical cyclones within the northwest Pacific Ocean . The system as it was initially conceived involved pattern matching of cloud features with a development and decay model . As the technique matured through the 1970s and 1980s , measurement of cloud features became dominant in defining tropical cyclone intensity and central pressure of the tropical cyclone 's low @-@ pressure area . Use of infrared satellite imagery led to a more objective assessment of the strength of tropical cyclones with eyes , using the cloud top temperatures within the eyewall and contrasting them with the warm temperatures within the eye itself . Constraints on short term intensity change are used less frequently than they were back in the 1970s and 1980s . The central pressures assigned to tropical cyclones have required modification , as the original estimates were 5 @-@ 10 hPa ( 0 @.@ 15 @-@ 0 @.@ 29 inHg ) too low in the Atlantic and up to 20 hPa ( 0 @.@ 59 inHg ) too high in the northwest Pacific . This led to the development of a separate wind @-@ pressure relationship for the northwest Pacific , devised by Atkinson and Holliday in 1975 , then modified in 1977 . 
- As reliance on human analysts introduces subjective biases into the technique , efforts have been made to make more objective estimates using computer programs , which have been aided by higher @-@ resolution satellite imagery and more powerful computers . Since tropical cyclone satellite patterns can fluctuate over time , automated techniques use a six @-@ hour averaging period to lead to more reliable intensity estimates . Development of the objective Dvorak technique , which performed best with tropical cyclones that had eyes ( of hurricane or typhoon strength ) , began in 1998 . It still required a manual center placement , keeping some subjectivity within the process . By 2004 , an advanced objective Dvorak technique was developed which utilized banding features for systems below hurricane intensity and to objectively determine the tropical cyclone 's center . A central pressure bias relating to the slope of the tropopause and cloud top temperatures , which change with latitude , was uncovered in 2004 ; its discovery helped improve central pressure estimates within the objective technique . 
- 
- = = Details of the method = = 
- 
- In a developing cyclone , the technique takes advantage of the fact that cyclones of similar intensity tend to have certain characteristic features , and as they strengthen , they tend to change in appearance in a predictable manner . The structure and organization of the tropical cyclone are tracked over 24 hours to determine if the storm has weakened , maintained its intensity , or strengthened . Various central cloud and banding features are compared with templates that show typical storm patterns and their associated intensity . If infrared satellite imagery is available for a cyclone with a visible eye pattern , then the technique utilizes the difference between the temperature of the warm eye and the surrounding cold cloud tops to determine intensity ( colder cloud tops generally indicate a more intense storm ) . In each case a " T @-@ number " ( an abbreviation for Tropical Number ) and a Current Intensity ( CI ) value are assigned to the storm . These measurements range between 1 ( minimum intensity ) and 8 ( maximum intensity ) . The T @-@ number and CI value are the same except for weakening storms , in which case the CI is higher . For weakening systems , the CI is held as the tropical cyclone intensity for 12 hours , though research from the National Hurricane Center indicates that six hours is more reasonable . The table at right shows the approximate surface wind speed and sea level pressure that corresponds to a given T @-@ number . The amount a tropical cyclone can change in strength per 24 ‑ hour period is limited to 2 @.@ 5 T @-@ numbers per day . 
- 
- = = = Pattern types = = = 
- 
- Within the Dvorak satellite strength estimate for tropical cyclones , there are several visual patterns that a cyclone may take on which define the upper and lower bounds on its intensity . The primary patterns used are curved band pattern ( T1.0 @-@ T4.5 ) , shear pattern ( T1.5 @-@ T3.5 ) , central dense overcast ( CDO ) pattern ( T2.5 @-@ T5.0 ) , banding eye pattern ( T4.0 @-@ T4.5 ) , eye pattern ( T4.5 - T8.0 ) , and central cold cover ( CCC ) pattern . Both the central dense overcast and embedded eye pattern utilize the size of the CDO . The CDO pattern intensities start at T2.5 , equivalent to minimal tropical storm intensity ( 40 miles per hour ( 64 km / h ) ) . The shape of the central dense overcast is also considered . The farther the center is tucked into the CDO , the stronger it is deemed . Tropical cyclones with maximum sustained winds between 65 miles per hour ( 105 km / h ) and 100 miles per hour ( 160 km / h ) can have their centers of circulation obscured by cloudiness within visible and infrared satellite imagery , which makes diagnosis of their intensity a challenge . 
- The CCC pattern , with its large and quickly developing mass of thick cirrus clouds spreading out from an area of convection near a tropical cyclone center within a short time frame , indicates little development . When it develops , rainbands and cloud lines around the tropical cyclone weaken and the thick cloud shield obscures the circulation center . While it resembles a CDO pattern , it is rarely seen . 
- The eye pattern utilizes the coldness of the cloud tops within the surrounding mass of thunderstorms and contrasts it with the temperature within the eye itself . The larger the temperature difference is , the stronger the tropical cyclone . Winds within tropical cyclones can also be estimated by tracking features within the CDO using rapid scan geostationary satellite imagery , whose pictures are taken minutes apart rather than every half @-@ hour . 
- Once a pattern is identified , the storm features ( such as length and curvature of banding features ) are further analyzed to arrive at a particular T @-@ number . 
- 
- = = Usage = = 
- 
- Several agencies issue Dvorak intensity numbers for tropical cyclones and their precursors . These include the National Hurricane Center 's Tropical Analysis and Forecast Branch ( TAFB ) , the National Oceanic and Atmospheric Administration 's Satellite Analysis Branch ( SAB ) , and the Joint Typhoon Warning Center at the Naval Pacific Meteorology and Oceanography Center in Pearl Harbor , Hawaii . 
- The National Hurricane Center will often quote Dvorak T @-@ numbers in their tropical cyclone products . The following example is from discussion number 3 of Tropical Depression 24 ( eventually Hurricane Wilma ) of the 2005 Atlantic hurricane season : 
- BOTH TAFB AND SAB CAME IN WITH A DVORAK SATELLITE INTENSITY ESTIMATE OF T2.5 / 35 KT . HOWEVER ... OFTENTIMES THE SURFACE WIND FIELD OF LARGE DEVELOPING LOW PRESSURE SYSTEMS LIKE THIS ONE WILL LAG ABOUT 12 HOURS BEHIND THE SATELLITE SIGNATURE . THEREFORE ... THE INITIAL INTENSITY HAS ONLY BEEN INCREASED TO 30 KT . 
- Note that in this case the Dvorak T @-@ number ( in this case T2.5 ) was simply used as a guide but other factors determined how the NHC decided to set the system 's intensity . 
- The Cooperative Institute for Meteorological Satellite Studies ( CIMSS ) at the University of Wisconsin – Madison has developed the Objective Dvorak Technique ( ODT ) . This is a modified version of the Dvorak technique which uses computer algorithms rather than subjective human interpretation to arrive at a CI number . This is generally not implemented for tropical depressions or weak tropical storms . The China Meteorological Agency ( CMA ) is expected to start using the standard 1984 version of Dvorak in the near future . The Indian Meteorological Department ( IMD ) prefers using visible satellite imagery over infrared imagery due to a perceived high bias in estimates derived from infrared imagery during the early morning hours of convective maximum . The Japan Meteorological Agency ( JMA ) uses the infrared version of Dvorak over the visible imagery version . Hong Kong Observatory and JMA continue to utilize Dvorak after tropical cyclone landfall . Various centers hold on to the maximum current intensity for 6 – 12 hours , though this rule is broken when rapid weakening is obvious . 
- Citizen science site Cyclone Center uses a modified version of the Dvorak technique to categorize post @-@ 1970 tropical weather . 
- Satellite Images of Selected Tropical Storms and Associated T @-@ Number 
- 
- = = Benefits and disadvantages = = 
- 
- The most significant benefit of the use of the technique is that it has provided a more complete history of tropical cyclone intensity in areas where aircraft reconnaissance is neither possible nor routinely available . Intensity estimates of maximum sustained wind are currently within 5 miles per hour ( 8 @.@ 0 km / h ) of what aircraft are able to measure half of the time , though the assignment of intensity of systems with strengths between moderate tropical @-@ storm force ( 60 miles per hour ( 97 km / h ) ) and weak hurricane- or typhoon @-@ force ( 100 miles per hour ( 160 km / h ) ) is the least certain . Its overall precision has not always been true , as refinements in the technique led to intensity changes between 1972 and 1977 of up to 20 miles per hour ( 32 km / h ) . The method is internally consistent in that it constrains rapid increases or decreases in tropical cyclone intensity . Some tropical cyclones fluctuate in strength more than the 2 @.@ 5 T numbers per day limit allowed by the rule , which can work to the technique 's disadvantage and has led to occasional abandonment of the constraints since the 1980s . Systems with small eyes near the limb , or edge , of a satellite image can be biased too weakly using the technique , which can be resolved through use of polar @-@ orbiting satellite imagery . Subtropical cyclone intensity cannot be determined using Dvorak , which led to the development of the Hebert @-@ Poteat technique in 1975 . Cyclones undergoing extratropical transition , losing their thunderstorm activity , see their intensities underestimated using the Dvorak technique . This led to the development of the Miller and Lander extratropical transition technique which can be used under these circumstances . 
- 
- 
- = New York State Route 31B = 
- 
- New York State Route 31B ( NY 31B ) was a state highway in central New York in the United States . It served as a connector between NY 31 , its parent route , in the Cayuga County village of Weedsport and NY 5 in the Onondaga County town of Elbridge . NY 31B was assigned c . 1933 , replacing New York State Route 293 , a route assigned as part of the 1930 renumbering of state highways in New York . The NY 31B designation was removed in 1980 and replaced by County Route 31B ( CR 31B ) in Cayuga County and CR 99 in Onondaga County . 
- 
- = = Route description = = 
- 
- NY 31B began at an intersection with its parent route , NY 31 , in the Cayuga County village of Weedsport . The highway went eastward , intersecting with NY 34 less than 0 @.@ 1 miles ( 0 @.@ 2 km ) later . Much of Weedsport was urbanized , with the highway passing residential homes and businesses as it progressed eastward through the village . The highway intersected with CR 12B before leaving Weedsport and entering the town of Brutus as Brutus Road . 
- In Brutus , NY 31B continued to the east through the rural town , intersecting CR 136A and passing the Weedsport Rural Cemetery before turning to the southeast . After a short distance , the highway went through an isolated area of homes , where it intersected with CR 14 and CR 15A . NY 31B continued on , intersecting several local highways before crossing into Onondaga County and terminating at an intersection with NY 5 in the town of Elbridge . 
- 
- = = History = = 
- 
- What became NY 31B was originally designated as NY 293 as part of the 1930 renumbering of state highways in New York . NY 293 was renumbered to NY 31B c . 1933 , allowing the NY 293 designation to be transferred to another highway in Orange County . NY 31B remained unchanged until January 2 , 1980 , when the NY 31B designation was removed . 
- Ownership and maintenance of NY 31B 's former routing was gradually transferred to the counties it went through , namely Cayuga and Onondaga . The first section to be transferred was the portion within Onondaga County , which was given to the county on April 1 , 1980 , and designated as CR 99 . The Cayuga County section of former NY 31B was transferred to the county one year later on April 1 , 1981 , and designated as CR 31B for the state route that preceded it . 
- 
- = = Major intersections = = 
- 
- 
- 
- = Ben Amos = 
- 
- Benjamin Paul " Ben " Amos ( born 10 April 1990 ) is an English professional footballer who plays as a goalkeeper for Bolton Wanderers . Born in Macclesfield , Cheshire , Amos began his career with Crewe Alexandra 's youth academy , but joined Manchester United at the age of 11 . He has spent time on loan at Peterborough United , Molde , and Oldham Athletic . Additionally , Amos is an England youth international , having represented his country at every level from Under @-@ 16 to Under @-@ 21 . 
- 
- = = Club career = = 
- 
- 
- = = = Crewe Alexandra = = = 
- 
- Amos was born in Macclesfield , Cheshire and was a member of the Crewe Alexandra academy until he was released at the age of 10 . While at Crewe , he also played for another local team , Bollington United , as a centre midfielder . One year later , Amos was playing for his local team against the team at the top of the table , and they needed to win the match to win the league . Amos ' team 's goalkeeper was injured during the game , and as the tallest on the team , Amos was put in goal ; however , he had also been the team 's regular penalty taker all season , so when they were awarded a penalty , he went all the way up the pitch to take the kick . He scored , and his team went on to win the game 3 – 2 , together with the league title . After the game , his parents told him that a Manchester United scout had been watching him and that he had been invited for trials . Amos joined Manchester United at the age of 11 . 
- 
- = = = Manchester United = = = 
- 
- In his first season at Manchester United , Amos became a regular starter for the club 's Under @-@ 13 team , playing in 19 out of 27 matches in the 2001 – 02 season . Amos was named as an unused substitute for the Under @-@ 18 side for the first time on 8 January 2005 , for a league game against Manchester City . His first appearance for the Under @-@ 18s came exactly nine months later , on 8 October 2005 , coming on as a substitute for Danny Rose after starting goalkeeper Ron @-@ Robert Zieler was sent off in a 2 – 0 defeat to Bolton Wanderers . He was regularly named as an unused substitute during the 2005 – 06 season – including for two reserve team matches – but became a frequent starter for the Under @-@ 18s in 2006 – 07 after signing a trainee contract in July 2006 . However , he missed the final of the 2006 – 07 FA Youth Cup with a dislocated shoulder . 
- He retained his place in the Under @-@ 18 team for 2007 – 08 , in addition to making his debut for the reserve team against Wigan Athletic on 7 November 2007 , and during the season he impressed enough to be selected to go on the first @-@ team 's 2008 summer tour of South Africa . Amos was named as a substitute for all three matches of the tour , but did not play . En route back from South Africa , United stopped off in Nigeria to play against Portsmouth on 27 July 2008 , with Amos replacing Tomasz Kuszczak after 76 minutes . He made his competitive first @-@ team debut on 23 September 2008 in a 3 – 1 win at home to Middlesbrough in the third round of the League Cup . On 14 December 2008 , Amos travelled to Japan with the Manchester United squad for the 2008 FIFA Club World Cup , having been called up as a late replacement for Ben Foster , who had suffered a hand injury while training . 
- 
- = = = Loans to Peterborough and Molde = = = 
- 
- On 29 October 2009 , Amos signed for the Championship team Peterborough United on a month 's loan as cover for Peterborough 's suspended first @-@ choice goalkeeper Joe Lewis . He made his only appearance on 31 October in a 2 – 1 defeat against Barnsley . After returning to Manchester United , Amos was again sent out on loan in March 2010 , this time to Norwegian side Molde FK , where he remained on loan until 30 June 2010 . 
- 
- = = = Return to Manchester United = = = 
- 
- Following the departure of Ben Foster from Manchester United to Birmingham City , United manager Alex Ferguson declared that Amos would be Manchester United 's third @-@ choice goalkeeper for the 2010 – 11 season behind Edwin van der Sar and Tomasz Kuszczak . He made his first appearance of the season on 26 October 2010 , starting in goal for United 's 3 – 2 win over Wolverhampton Wanderers in the fourth round of the League Cup . 
- In United 's final Champions League group match on 7 December 2010 , Amos was picked to start against Valencia at Old Trafford . Pablo Hernández scored Valencia 's only goal past him after 32 minutes of the match – the first goal United had conceded in the Champions League that season – as the two sides played out a 1 – 1 draw . 
- 
- = = = Loan to Oldham Athletic = = = 
- 
- With the signing of Danish goalkeeper Anders Lindegaard , Manchester United allowed Amos to join Oldham Athletic on loan for the remainder of the season on 7 January 2011 , although he would continue to train with Manchester United once a week . He made his debut against Swindon Town the next day and kept a clean sheet . However , three days later , he conceded all six goals in a 6 – 0 defeat at home to Southampton ; he allowed Adam Lallana 's 20 @-@ yard shot underneath his body for the second goal , and he was rounded by Lee Barnard for the sixth . 
- On 15 March 2011 , Lindegaard was ruled out for five weeks following a knee injury , so Amos was recalled from Oldham to cover for Edwin van der Sar and Tomasz Kuszczak . 
- 
- = = = Return to Manchester United = = = 
- 
- Amos made his first start of the 2011 – 12 season in a third round League Cup tie at Elland Road against Leeds United . He kept a clean sheet as United cruised through to the next round winning 3 – 0 . He played again in the following round away at Aldershot Town , a game which United also won 3 – 0 to advance to the quarter finals . He was in goal again for United 's League Cup game in a defeat against Championship club Crystal Palace on 30 November . It also seemed that he had moved above last season 's second choice Tomasz Kuszczak in the pecking order , but was now third choice behind Anders Lindegaard and David de Gea . He made his first Premier League start in a 2 – 0 home win against Stoke City on 31 January 2012 , keeping a clean sheet on his debut . 
- On 30 May 2012 , Amos signed a three @-@ year contract extension with Manchester United , which will keep him at the club until at least 2015 . 
- 
- = = = Loan to Hull City = = = 
- 
- On 31 July 2012 , Amos joined Championship team Hull City on a season @-@ long loan from Manchester United . Before the loan deal was completed , he had joined Hull on their pre @-@ season training camp in Portugal . He made his debut on 11 August 2012 in a 7 – 6 penalty shoot @-@ out victory over Rotherham United in the first round of the 2012 – 13 Football League Cup . However , after 19 appearances , including two in the League Cup , Amos returned to Manchester United on 3 January 2013 . 
- 
- = = = Loan to Carlisle United = = = 
- 
- On 15 November 2013 , Amos joined League One team Carlisle United on a month @-@ long loan from Manchester United . 
- 
- = = = Loan to Bolton Wanderers = = = 
- 
- On 30 January 2015 he joined Championship club Bolton Wanderers on a month 's loan to provide competition for Andy Lonergan after Ádám Bogdán was ruled out with an ankle injury picked up in training . On 21 February 2015 he made his Bolton debut as a substitute for the injured Lonergan as Wanderers lost 4 – 1 to Nottingham Forest at the City Ground . He played nine Championship games for Bolton before his loan expired on 4 April . 
- 
- = = = Return to Manchester United = = = 
- 
- Ben Amos returned to Manchester United after his loan spell at Bolton Wanderers , but on 10 June 2015 , it was announced on the Manchester United website that Ben Amos had been released from the club along with four other players . 
- 
- = = = Bolton Wanderers = = = 
- 
- On 1 July 2015 , Amos returned to Bolton Wanderers following his release from Manchester United , signing a four @-@ year contract with the club . 
- 
- = = International career = = 
- 
- Amos is an England youth international , having played for his country at the Under @-@ 16 , Under @-@ 17 , Under @-@ 18 , Under @-@ 19 , Under @-@ 20 and Under @-@ 21 levels . He was called up to the Under @-@ 21 squad in 2011 for their friendly match against Iceland , but he was an unused substitute . 
- 
- = = Career statistics = = 
- 
- As of 1 December 2015 
- 
- = = Honours = = 
- 
- 
- = = = Club = = = 
- 
- Manchester United 
- FIFA Club World Cup ( 1 ) : 2008 
- 
- = = Personal life = = 
- 
- Amos attended Fallibroome High School , where he earned 11 GCSEs at grade C or above . 
- 
- 
- = Clayton Kershaw = 
- 
- Clayton Edward Kershaw ( born March 19 , 1988 ) is an American professional baseball pitcher for the Los Angeles Dodgers of Major League Baseball ( MLB ) . A left @-@ handed starting pitcher , Kershaw has played in the major leagues since 2008 , and his career earned run average ( ERA ) and Walks and hits per innings pitched average ( WHIP ) are the lowest among starters in the live @-@ ball era with a minimum of 1 @,@ 000 innings pitched . His career hits allowed per nine innings pitched average ( 6 @.@ 64 ) is the second lowest in MLB history . A three @-@ time Cy Young Award winner , the 2014 National League Most Valuable Player and the Los Angeles Dodgers ' all @-@ time leader in walks and hits per innings pitched ( 1 @.@ 01 ) and hits allowed per nine innings pitched ( 6 @.@ 64 ) , Kershaw is considered by many to be the best pitcher in MLB . 
- Kershaw was drafted seventh overall in the 2006 MLB draft . He worked his way through the Dodgers ' farm system in just one full season , and reached the majors at 20 years old . When he debuted in 2008 , he was the youngest player in MLB , a title he held for one full year . In 2011 , he won the pitching Triple Crown and the National League Cy Young Award , becoming the youngest pitcher to accomplish either of these feats since Dwight Gooden in 1985 . Kershaw pitched a no @-@ hitter on June 18 , 2014 , becoming the 22nd Dodger to do so . Being a left @-@ handed strikeout pitcher and playing for the Los Angeles Dodgers , Kershaw has often been compared to Hall of Fame pitcher Sandy Koufax . He became the first pitcher in history to lead MLB in ERA for four consecutive years when he did so in the 2011 through 2014 seasons . 
- Off the field , Kershaw is an active participant in volunteer work . He and his wife , Ellen , launched " Kershaw 's Challenge " and wrote the book Arise to raise money to build an orphanage in Zambia . He has been honored with the Roberto Clemente Award and the Branch Rickey Award for his humanitarian work . 
- 
- = = Early life = = 
- 
- Kershaw was born in Dallas , Texas on March 19 , 1988 . His parents divorced when he was 10 , and he was raised by his mother . He played in youth sports leagues as a child , including Little League Baseball . 
- Kershaw attended nearby Highland Park High School , where he played baseball and was also the center for quarterback Matthew Stafford on the football varsity . After a growth spurt and further development of his pitches , he established himself as an elite high school prospect in 2006 when he posted a 13 – 0 record with an earned run average ( ERA ) of 0 @.@ 77 , and recorded 139 strikeouts in 64 innings pitched . In a playoff game against Northwest High School of Justin , Texas , Kershaw pitched an all @-@ strikeout perfect game . He struck out all 15 batters he faced in the game , which was shortened because of the mercy rule . He also pitched for USA Baseball 's Junior National Team in the Pan Am Championship . Kershaw was selected by USA Today as " High School Baseball Player of the Year " , and was also the Gatorade National Player of the Year for baseball . 
- Entering the 2006 Major League Baseball ( MLB ) draft , Kershaw was considered the top high @-@ school pitcher available . The Los Angeles Dodgers selected Kershaw with the seventh overall pick in the draft . He had committed to Texas A & M University , but turned down the scholarship offer to sign with the Dodgers , with a bonus estimated at $ 2 @.@ 3 million . The bonus was the largest to any Dodgers draft pick at the time , and was eventually topped by Zach Lee in the 2010 draft . 
- 
- = = Professional career = = 
- 
- 
- = = = Minor Leagues = = = 
- 
- Kershaw began his career with the Gulf Coast League ( GCL ) Dodgers . He pitched in 37 innings in which he struck out 54 batters ( walking only 5 ) , while compiling a record of 2 – 0 with a 1 @.@ 95 ERA . He featured a fastball that topped out at 96 miles per hour ( 154 km / h ) and he was rated as the top prospect in the GCL , and the Dodgers ' second best prospect by Baseball America behind third baseman Andy LaRoche . 
- Kershaw was promoted to the Great Lakes Loons in 2007 , where he recorded a record of 7 – 5 with a 2 @.@ 77 ERA . He was selected to play on the East Team in the Midwest League All @-@ Star Game and on the USA team in the All @-@ Star Futures Game . On August 6 , he was promoted to the Double @-@ A Jacksonville Suns in the Southern League , where he produced a 1 – 2 record and 3 @.@ 65 ERA in five starts and was selected as the top prospect in the Dodgers organization heading into the 2008 season . 
- During spring training in a game against the Boston Red Sox , Kershaw gained much attention for throwing a curveball to Sean Casey that started behind Casey but at the end looped into the strike zone and struck him out looking . Kershaw was 0 – 3 and had a 2 @.@ 28 ERA with 47 strikeouts through 43 1 ⁄ 3 innings pitched in his first stint of the year with the Suns . He was then called up to the majors on May 28 , 2008 , but optioned back to Jacksonville on July 2 . 
- Kershaw pitched 18 innings during his second trip to Jacksonville ( two starts and one seven inning relief appearance ) , winning two games . During this stretch , he allowed only two earned runs , lowering his ERA to 1 @.@ 91 . He was recalled on July 22 . 
- 
- = = = Los Angeles Dodgers = = = 
- 
- 
- = = = = 2008 – 2010 seasons : Early career = = = = 
- 
- On May 24 , 2008 , the Dodgers bought Kershaw 's minor @-@ league contract , and he was added to the active roster . Sportswriter Tony Jackson called Kershaw 's debut the most anticipated start by a Dodgers pitcher since Hideo Nomo 's major league debut during the 1995 season . He made his debut on May 25 , starting against the St. Louis Cardinals . He struck out the first batter he faced , Skip Schumaker , the first of seven strikeouts in the game , in which he pitched six innings and allowed two runs . When he debuted , Kershaw was the youngest player in MLB , a title he held for one full year . 
- Kershaw won his first major league game against the Washington Nationals on July 27 , 2008 . He pitched six @-@ plus shutout innings , allowing four hits , a walk , and he struck out five . Kershaw finished his rookie season 5 – 5 , with a 4 @.@ 26 ERA in 22 games ( 21 starts ) . He also pitched two innings out of the bullpen for the Dodgers in the 2008 National League Championship Series ( NLCS ) against the Philadelphia Phillies . 
- On April 15 , 2009 , Kershaw pitched seven innings , striking out 13 batters while allowing only one hit ( a solo home run ) against the rival San Francisco Giants . He was the youngest Dodger to ever strike out 13 or more batters in a game since Sandy Koufax did it in the 1955 season . On May 17 , 2009 , Kershaw did not allow a hit against the Florida Marlins through 7 innings , then gave up a lead @-@ off double to Florida 's Cody Ross . In 2009 , despite an 8 – 8 record , he led the major leagues in opposing batting average ( .200 ) , opposing slugging percentage ( .282 ) , and hits per nine innings ( 6 @.@ 26 ) . He also posted an ERA of 2 @.@ 79 and 185 strikeouts . Kershaw also walked 91 batters , which was second most in the National League ( NL ) . 
- Kershaw made his playoff starting debut against the St. Louis Cardinals in the 2009 National League Division Series ( NLDS ) . He went 6 2 ⁄ 3 innings , striking out 4 , walking 1 , and ended up getting a no @-@ decision ( the Dodgers went on to win the game in the 9th inning ) . At 21 years old , he started the opener of the 2009 NLCS against the Philadelphia Phillies and was the third youngest pitcher to ever start a playoff series opener , behind only Fernando Valenzuela in the 1981 NLDS and Rick Ankiel in the 2000 NLDS . 
- Kershaw started the 2010 season by posting a 3 @.@ 07 ERA in April , but did so by walking 22 batters in 29 innings . On May 4 , he had the worst start of his career against the Milwaukee Brewers at Dodger Stadium , throwing just 57 pitches in 1 1 ⁄ 3 innings , while retiring only four of the 13 batters he faced — including the pitcher . He was booed loudly upon being pulled from the game . Kershaw said after the game , " I didn 't give our team any kind of chance . It 's just not a good feeling to let your teammates down , let everybody down . It stings , it hurts . I 've got to figure things out . " 
- Kershaw rebounded his next start by pitching an 8 inning two @-@ hitter and out @-@ dueling the then undefeated Ubaldo Jiménez . He credited his control of the slider as being the major turning point for him . Later in the season , he was suspended for five games after hitting Aaron Rowand of the Giants with a pitch in a game on July 20 . The incident occurred after both teams were given a warning following Giants ace Tim Lincecum hitting Matt Kemp earlier in the game . He threw his first career complete game shutout on September 14 , 2010 also against San Francisco and finished the season with a record of 13 – 10 and a 2 @.@ 91 ERA in 32 starts , pitching 204 1 ⁄ 3 innings and recording 212 strikeouts . 
- 
- = = = = 2011 season : 1st Cy Young Award = = = = 
- 
- After finishing the 2010 season strong , the Dodgers named Kershaw as the Opening Day Starter for the 2011 season . On May 29 , he pitched the second complete @-@ game shutout of his career , striking out 10 while winning a two @-@ hitter against the Florida Marlins , 8 – 0 ; he also had two singles and an RBI , scoring twice in the game . He produced his third career shutout on June 20 , a two @-@ hit , 11 @-@ strikeout effort against the Detroit Tigers . Kershaw became the first Dodgers starter to strike out the side in the 9th inning since Sandy Koufax 's perfect game . In his next start , on June 26 , Kershaw pitched another complete game ( against the Los Angeles Angels of Anaheim ) . He became the first Dodger starter to have back @-@ to @-@ back complete game victories since Jeff Weaver in the 2005 season and the first Dodger to have double @-@ digit strikeouts in consecutive starts since Chan @-@ Ho Park in the 2000 season . He was awarded the National League Player of the Week award for the week of June 20 – 26 as a result of those two starts . Midway through June , Kershaw had amassed 32 career victories , a 3 @.@ 15 ERA and 593 career strikeouts in 568 @.@ 2 innings . According to the Elias Sports Bureau , Kershaw was the first 23 @-@ year @-@ old pitcher to have that many victories , an ERA that low and an average of more than one strikeout per inning since ERA became an official statistic in 1910 . 
- Kershaw was selected to the National League team for the 2011 Major League Baseball All @-@ Star Game , his first All @-@ Star selection . In the month of July , Kershaw was 4 – 1 with a 2 @.@ 02 ERA and NL @-@ leading 45 strikeouts , earning him the National League Pitcher of the Month Award . On August 23 , he struck out Matt Holliday of the St. Louis Cardinals for his 200th strikeout of the season and became the 10th Dodger pitcher to record back @-@ to @-@ back 200 strikeout seasons and the first since Chan @-@ Ho Park did it in the 2001 season . 
- Kershaw finished the 2011 season by leading the NL with 21 wins , 248 strikeouts and a 2 @.@ 28 ERA , winning the NL pitching Triple Crown , the first Triple Crown winner since Jake Peavy of the 2007 San Diego Padres and the first Dodger since Sandy Koufax won it in the 1966 season . Justin Verlander of the Detroit Tigers won the American League Triple Crown the same season , marking the first major @-@ league season since 1924 to feature Triple Crown @-@ winning pitchers in both leagues . Kershaw 's 21 wins were the most by a Dodger pitcher since Orel Hershiser won 23 during the 1988 season . His ERA was the lowest by a Dodger since Hershiser 's 2 @.@ 03 in the 1985 season , his strikeouts were the most by a Dodger since Koufax 's 317 in 1966 and his 233 1 ⁄ 3 innings pitched were the most since Chan Ho Park pitched 234 in 2001 . Since 1965 when Koufax did it , Peavy and Kershaw are the only two pitchers in the National League to have led the league in wins , strikeouts , ERA , and WHIP ( walks plus hits per inning pitched ) . Kershaw also became just the second lefthander to have 240 @-@ plus strikeouts in a season before the age of 24 , joining Vida Blue . 
- After the season , Kershaw was awarded the Warren Spahn Award as the best left @-@ handed pitcher in 2011 , the Players Choice Award for Most Outstanding National League pitcher , the Gold Glove Award as the top fielding pitcher in the NL and the Sporting News ( TSN ) National League Pitcher of the Year . He was additionally selected as the starting pitcher for the TSN NL All @-@ Star Team . On November 17 , he was honored with the National League Cy Young Award , making him the youngest Cy Young winner since Dwight Gooden of the 1985 New York Mets . He was the 8th Dodger pitcher to win the award , the first since Éric Gagné in the 2003 season . 
- 
- = = = = 2012 season : Cy Young runner @-@ up = = = = 
- 
- On February 7 , 2012 , Kershaw and the Dodgers agreed on a two @-@ year , $ 19 million contract . The contract was the second highest for a player in his first year of arbitration ( after Tim Lincecum 's $ 23 million 2 @-@ year contract in 2010 ) . 
- Kershaw was the Dodgers ' Opening Day starter for the second year in a row , where he pitched three innings of shutout ball against the San Diego Padres at Petco Park before being removed from the game due to flu @-@ like symptoms . On April 27 , he was able to last through eight innings for his second win of the season against the Washington Nationals . The win was also his 12th straight home win , tying him with Ed Roebuck ( June 1960 – August 1962 ) and Orel Hershiser ( September 1984 – October 1985 ) for the longest home winning streak since the Dodgers moved to Los Angeles . Kershaw won the National League 's Player of the Week Award for the week of May 14 – 20 after he made two starts during that week and pitched 16 scoreless innings , including his fourth career shutout . Kershaw was selected to appear in the 2012 Major League Baseball All @-@ Star Game , the second straight year he made the team . On August 11 , he went over 200 innings on the season , becoming the 12th Los Angeles Dodger pitcher with three or more seasons of 200 or more innings , and the first since Hershiser did it five times from 1985 to 1989 . Kershaw also became just the fifth Dodger pitcher with three straight 200 strikeout seasons . 
- Kershaw finished 2012 with a 14 – 9 record , a 2 @.@ 53 ERA ( leading the league ) , 229 strikeouts , and 227 2 ⁄ 3 innings pitched , coming second in both categories . He became the first pitcher to lead the league in ERA in consecutive seasons since Arizona 's Randy Johnson in 2001 – 02 . This also marked his fourth year in a row with a sub @-@ 3 @.@ 00 ERA , making him the first to do this since Randy Johnson from 1999 – 2002 . He finished second for the NL Cy Young behind R. A. Dickey , receiving two first place votes . 
- 
- = = = = 2013 season : 2nd Cy Young Award = = = = 
- 
- Kershaw made his third straight opening day start for the Dodgers in the 2013 season , the first Dodger starter to do so since Derek Lowe ( 2005 – 2007 ) . In that opening day start he pitched a complete game , four hit , shutout over the Giants and also hit his first career home run . He was the first pitcher to throw a shutout and hit a home run on opening day since Bob Lemon of the Cleveland Indians did so against the Chicago White Sox on April 14 , 1953 . Kershaw picked up his 1,000th career strikeout on April 17 , 2013 , when he struck out Yonder Alonso of the Padres . He was the second youngest Dodger to reach that mark , behind only Fernando Valenzuela . On May 14 , Kershaw passed the 1 @,@ 000 inning mark for his career . His ERA of 2 @.@ 70 at the time was the fifth best of the live @-@ ball era at the 1 @,@ 000 inning mark and the best career mark . He also threw 130 pitches that day , the most of his career and the most by a Dodger pitcher since Odalis Pérez in the 2003 season . 
- Kershaw was selected to the 2013 Major League Baseball All @-@ Star Game , his third straight selection . In July , he compiled a 4 – 1 record and 1 @.@ 34 ERA in six starts and was awarded his second National League Pitcher of the Month Award . On September 2 , Kershaw picked up his 200th strikeout of 2013 , joining Hall of Famers Sandy Koufax and Don Drysdale as the only starters in Dodgers history with at least 4 consecutive seasons of more than 200 strikeouts . 
- Kershaw finished the season with a 16 @-@ 9 record , 236 innings pitched ( a career high ) , and a Major League best 1 @.@ 83 ERA and 0 @.@ 92 WHIP . He was the third player in history to lead the Majors in ERA three years in a row , joining Greg Maddux ( 1993 – 95 ) and Lefty Grove ( 1929 – 31 ) . His ERA was the first sub @-@ 2 @.@ 00 ERA since Roger Clemens did it in the 2005 season and the lowest overall since Pedro Martínez in the 2000 season . He was only the third Dodger pitcher to have an ERA under 3 @.@ 00 in five consecutive seasons ( Koufax and Nap Rucker ) . 
- Kershaw struck out 12 batters in seven innings in the first game of the 2013 National League Division Series . That was the third most strikeouts by a Dodger pitcher in the playoffs , behind only Koufax ( 15 in the 1963 World Series ) and Carl Erskine ( 14 in the 1953 World Series ) . His six straight strikeouts in the game tied a MLB postseason record set by Tim Belcher in the second game of the 1988 World Series . He picked up his first career postseason victory in that game . 
- Kershaw won the Warren Spahn Award for 2013 , the second time he had won the award , which honors the best left @-@ handed pitcher in the Major Leagues . He was also selected to the Sporting News NL All @-@ Star team , the fourth Dodger pitcher to be named to the team twice ( after Koufax , Valenzuela and Don Newcombe ) . On November 13 , he won the NL Cy Young Award for the second time in three seasons . He became just the sixth pitcher in history to finish in the top two in voting three seasons in a row . 
- After the season , Kershaw and the Dodgers agreed on a seven @-@ year , $ 215 million , contract extension . The deal was the richest in MLB history for a pitcher , eclipsing the seven @-@ year , $ 180 million , contract signed by Justin Verlander the previous year . The average annual value of $ 30 @.@ 7 million was also the largest ever for a baseball player , beating the $ 28 million Roger Clemens received in 2007 and the 10 @-@ year , $ 275 million contract that Alex Rodriguez signed that same year . 
- 
- = = = = 2014 season : MVP and 3rd Cy Young Award = = = = 
- 
- Kershaw made his fourth straight opening day start for the Dodgers in 2014 , only the fourth Dodger ever to do so . This season the game was played at the Sydney Cricket Ground in Australia . Before his second start , Kershaw felt some pain in his back and was placed on the disabled list for the first time in his career . He did not rejoin the Dodgers until early May . On June 18 , he pitched a complete game no @-@ hitter against the Colorado Rockies and struck out a career @-@ high 15 batters . The only batter to reach base was due to an error in the top of the seventh inning , costing Kershaw a perfect game . He is the only pitcher in MLB history with 15 strikeouts in a game while allowing no hits and no walks . Kershaw was 6 @-@ 0 with an 0 @.@ 82 ERA in June and was awarded with his third career Pitcher of the Month award . He was selected to the National League squad at the 2014 Major League Baseball All @-@ Star Game , his fourth straight selection . He was the sixth Dodger pitcher , and the first since Fernando Valenzuela to make the All @-@ Star team four years in a row . 
- Kershaw had a 41 inning scoreless inning streak that ended in the top of the sixth inning on July 10 when , with two outs , Chase Headley homered to left field at Dodger Stadium . Kershaw 's streak was , at the time , tied for the fifteenth longest scoreless inning streak in MLB history . He won the pitcher of the month award again in July , the third Dodger ( along with Don Sutton and Burt Hooton ) to win it two months in a row . He was 4 – 0 with a 1 @.@ 10 ERA in the month with 48 strikeouts and only 10 walks . He picked up his 200th strikeout of the season on September 2 , the fifth year in a row he had reached that number , trailing only the six seasons in a row for Sandy Koufax among Dodger starters . He also became just the fourth pitcher since 1893 to have at least five 200 @-@ strikeout seasons through an age @-@ 26 season ( Bert Blyleven , Walter Johnson and Sam McDowell are the others ) . 
- Kershaw finished the season 21 – 3 with a 1 @.@ 77 ERA in 27 starts . He led the National League in numerous categories once again , such as ERA , ERA + , Wins , Win % , WHIP , IP / GS , SO / 9 , Strikeout @-@ to @-@ walk ratio , complete games , FIP , and Wins Above Replacement for both pitchers and all NL players . He also finished third in strikeouts despite missing most of the first month of the season . He was the first pitcher in history to win four consecutive ERA titles . Many experts called his 2014 season one of the best pitching seasons in recent memory . 
- However , in his first start of the playoffs , in Game 1 of the Division Series against the Cardinals , Kershaw became the first pitcher in history to strike out 10 while allowing eight runs . He had cruised through the first six innings while allowing only two hits ( both solo homers ) and surrendered six runs in the seventh . He did tie Koufax for the only Dodgers pitchers with multiple double digit strikeout games in the playoffs . He was also the first pitcher in history to give up at least seven runs in back @-@ to @-@ back postseason starts ( his previous one was Game 6 of the 2013 National League Championship Series ) . Pitching on short rest in Game 4 , he would again be dominant , but again would take the loss after giving up a 3 @-@ run home run to Matt Adams in the 7th inning . It was the first home run Kershaw had allowed in his career to a left @-@ handed batter off his curveball . 
- Kershaw was honored after the season with player of the year awards from both The Sporting News and Baseball America . He won three awards at the Players Choice Awards including Outstanding NL Pitcher , Player of the Year and the Marvin Miller Man of the Year Award . He also won his third ( and second straight ) Warren Spahn Award . On November 12 , he was awarded his third Cy Young Award in four seasons ( a unanimous vote ) . The following day , he was elected as the NL MVP , the first National League pitcher to win the award since Bob Gibson in 1968 and the first Dodgers player to win the award since Kirk Gibson in 1988 . 
- 
- = = = = 2015 season : 300 @-@ strikeout season = = = = 
- 
- Kershaw made his fifth straight opening day start in 2015 , the first Dodgers pitcher to do so since Hall of Famer Don Sutton started seven in a row from 1972 through 1978 . He recorded his 1,500th career strikeout on May 10 when he fanned Drew Stubbs of the Colorado Rockies . Kershaw picked up his 100th career win on May 15 against the Rockies . He became the 22nd pitcher in franchise history and the second youngest active pitcher to reach that mark . Kershaw won his sixth career NL player of the week award for the week of June 1 – 7 , 2015 , when he allowed only two runs on 10 baserunners in 15 innings while striking out 18 in two starts that week . Kershaw did not make the initial NL roster for the 2015 All @-@ Star Game , though he was included on the Final Vote ballot , which he lost to Cardinals pitcher Carlos Martinez . However , he was added to the roster to replace Nationals pitcher Max Scherzer , who was unavailable due to pitching the Sunday before the game . It became his fifth straight all @-@ star selection , joining Sandy Koufax and Fernando Valenzuela as the only Dodgers pitchers to accomplish that feat . Kershaw struck out a season high 14 batters in eight shutout innings on July 18 against the Washington Nationals . He became the first Dodgers starter with back @-@ to @-@ back games of at least 13 strikeouts since Chan Ho Park in 2000 and the first Dodgers pitcher with back @-@ to @-@ back games of double @-@ digit strikeouts and no walks since Dazzy Vance in 1930 . He shared the NL player of the week honors with his teammate Zack Greinke for the week of July 13 – 19 and won NL pitcher of the month for July . 
- Kershaw picked up his 200th strikeout of the season on August 12 , tying Hideo Nomo 's 1995 season for the fastest to that mark in Dodgers history at 156 innings . This was the sixth straight 200 strikeout season for Kershaw , tying Sandy Koufax for the most in Dodgers franchise history . On October 4 , Kershaw became the 11th player in Major League history to strike out 300 batters in a season , the first player since Randy Johnson did it in 2002 . He finished the season with a 16 – 7 record , a 2 @.@ 13 ERA , and 301 strikeouts in 232 2 ⁄ 3 innings . 
- In Game One of the 2015 National League Division Series , Kershaw struck out 11 in 6 2 ⁄ 3 innings but allowed three runs for his fifth straight postseason loss . He and New York Mets starter Jacob deGrom were the first pair of starters to each throw at least 11 strikeouts in the same postseason game in MLB history . He rebounded in game four , earning the win on three days ' rest by allowing one run and three hits against eight strikeouts in seven innings on October 13 . Kershaw finished third in the National League Cy Young Award voting , behind Jake Arrieta and teammate Zack Greinke . In mid @-@ December 2015 , Kershaw participated in an expedition to Cuba composed of MLB officials and players , including former Dodgers manager Joe Torre . It was the first visit by MLB since 1999 , and one anticipated as an important step to help normalize relations with the United States that had begun to ease earlier in the year . 
- 
- = = = = 2016 = = = = 
- 
- Kershaw made his sixth straight opening day start in 2016 as the Dodgers won 15 – 0 . It also marked the first time the Dodgers had won six straight opening day games , all of which he started . On May 12 against the New York Mets , he struck out 13 while pitching a three @-@ hit complete game shutout . He set an MLB record with six consecutive starts with at least 10 strikeouts and no more than one walk and a club record with six consecutive starts with at least 10 strikeouts . He picked up his 100th strikeout on May 29 , while only walking five batters within that period . That was the lowest walk total for a pitcher reaching 100 strikeouts in the modern era , beating Cliff Lee who had seven walks in the 2010 season . On June 30 , 2016 , Kershaw was placed on the 15 @-@ day disabled list due to back pain . He received an MRI , which revealed that there was a mild herniated disc in the back , and received an epidural injection to treat the pain . He was named to the 2016 All @-@ Star team but was unable to pitch in the game due to his injury . On July 20 , the Dodgers shut down Kershaw for an indefinite period of time . He continued to feel discomfort in his back after a simulated game . 
- 
- = = Pitching style = = 
- 
- Kershaw 's pitching style relies on deception . He keeps the ball hidden so that it is hard for the batter to pick up the ball and has a consistent overhand delivery on all of his pitches . Out of the windup , Kershaw lowers his right foot vertically with a slight pause before moving it forward toward the plate . The motion was described during the 2015 National League Division Series as a " kickstand move " drawing comparison with one setting a kickstand on a bicycle . Out of the stretch , he uses a slide step as it makes it difficult for the base runner at first base to get a read on him . He has stated many times that he has modeled his pitching mechanics after his favorite pitcher growing up , Roger Clemens . 
- Kershaw 's repertoire includes a four @-@ seam fastball that sits anywhere from 92 miles per hour ( 148 km / h ) to 95 miles per hour ( 153 km / h ) and tops out at 98 miles per hour ( 158 km / h ) with late movement , a slider at 84 miles per hour ( 135 km / h ) – 87 miles per hour ( 140 km / h ) , a 12 – 6 curveball between 72 miles per hour ( 116 km / h ) – 76 miles per hour ( 122 km / h ) , and a seldom thrown changeup ( under 3 % ) . As of late in the 2015 season , he is believed to be experimenting with the use of a cutter . He is also known for having one of the better pickoff moves to first base and is considered one of the better fielding pitchers in the game . 
- 
- = = = Preparation = = = 
- 
- According to many teammates , Kershaw is a noted perfectionist . A.J. Ellis describes his preparation and perfectionism during bullpens before each start : 
- Three fastballs when I 'm standing up . I sit , and three fastballs down the middle . Then three fastballs either side . Three changeups away . Fastball inside . Three curveballs to the middle . Fastball inside . Three sliders to the middle . Then he goes to the stretch position . Two fastballs inside , two fastballs away , two changeups , one fastball inside , two curveballs , one fastball inside , two sliders . Back to the windup , and one fastball inside , one fastball away . Thirty @-@ four pitches in all . 
- 
- = = Awards and accomplishments = = 
- 
- 
- = = = Awards = = = 
- 
- 
- = = = Annual statistical achievements = = = 
- 
- Notes : Through 2015 season . Per Baseball @-@ Reference.com. 
- 
- = = Personal life = = 
- 
- Kershaw grew up in Dallas , Texas and attended school with quarterback Matthew Stafford and fellow pitchers Jordan Walden and Shawn Tolleson . One of his favorite players growing up was former Texas Rangers first baseman Will Clark , and the main reason he wears number 22 is to honor Clark . 
- He is the great @-@ nephew of astronomer Clyde Tombaugh , the discoverer of Pluto . Kershaw 's mother , born Marianne Tombaugh , is the daughter of Clyde Tombaugh 's younger brother . His father , Christopher George Kershaw , was a musician and won a Clio Award for his work . The elder Kershaw remarried after his divorce from Marianne and died in 2013 . 
- On December 4 , 2010 , Kershaw married his girlfriend of seven years , Ellen Melson . On January 23 , 2015 , Ellen gave birth to the couple 's first child , daughter Cali Ann . He is a Methodist with strong religious faith . 
- Kershaw made a cameo appearance in " Prince " , a Season 3 episode of New Girl which originally aired following FOX 's telecast of Super Bowl XLVIII . 
- 
- = = = Humanitarian work = = = 
- 
- Prior to the 2011 season , Kershaw visited Zambia with his wife as part of a Christian mission organized by Dallas @-@ based Arise Africa . After the trip , Kershaw announced his dream of building an orphanage in Lusaka , Zambia , which he called " Hope 's Home " after 11 @-@ year @-@ old Hope , an HIV @-@ positive child Kershaw met while in Zambia . To accomplish his goal , Kershaw pledged a donation of $ 100 per strikeout recorded in 2011 . With Kershaw 's career high of 248 strikeouts thrown during the 2011 season , he donated $ 492 @,@ 300 toward his $ 70 @,@ 000 goal . When Kershaw won the 2011 Players Choice Award , he donated $ 260 @,@ 000 to Hope 's Home . He and his wife returned to Zambia in 2012 . Kershaw donated $ 100 for every strikeout in the 2012 season to Kershaw 's Challenge , calling that season 's incarnation of the project " Strike Out To Serve . " Seventy percent of the money raised in 2012 went to Arise Africa , with 10 percent each going to the Peacock Foundation in Los Angeles , Mercy Street in Dallas , and I Am Second . In 2014 , Kershaw continued to support the children of Zambia , with partnership with CURE International , raising funds to pay for 170 children 's surgeries and new medical equipment for CURE hospital in Lusaka . Kershaw has continued his partnership with CURE International in 2015 , setting a goal of funding 100 surgeries for CURE 's hospital in the Dominican Republic . 
- In addition to Hope 's Home and Kershaw 's Challenge , he has also helped with other programs in Los Angeles , such as helping Habitat for Humanity demolish and rehabilitate a house in Lynwood , California . He is also a supporter of the Peacock Foundation , which provides animal @-@ assisted interventions and activities for at risk youth by partnering with mental health practitioners , public service agencies and community organizations . 
- 
- = = = Author = = = 
- 
- Kershaw and his wife , Ellen , co @-@ authored a book named Arise : Live Out Your Faith and Dreams on Whatever Field You Find Yourself about their Christian faith and their humanitarian efforts . The book was released on January 10 , 2012 through Regal Press . 
- 
- = = = Endorsements = = = 
- 
- Kershaw is a celebrity endorser for www.FantasyDraft.com , Wilson Sporting Goods ( glove ) , Under Armour ( shoes ) , Muscle Milk , and Subway . 
- 
- 
- = Josepha Petrick Kemarre = 
- 
- Josepha Petrick Kemarre ( born ca . 1945 or ca . 1953 , date uncertain ) is an Anmatyerre @-@ speaking Indigenous Australian from Central Australia . Since first taking up painting around 1990 , her works of contemporary Indigenous Australian art have been acquired by several major collections including Artbank and the National Gallery of Victoria . Her paintings portray bush plum " dreaming " and women ’ s ceremonies ( known as Awelye ) . One of her paintings sold at a charity auction for A $ 22 @,@ 800 . Josepha Petrick 's works are strongly coloured and formalist in composition and regularly appear at commercial art auctions in Australia . Her art appears to have survived the huge contraction of the primary art market in Australia since 2008 . There is no existing Catalogue raisonné of Josepha Petrick 's artworks ; to date , no fakes have been cited . 
- 
- = = Personal background = = 
- 
- Josepha Petrick Kemarre is an Anmatyerre @-@ speaking Indigenous Australian , born around 1945 or 1953 at the Santa Teresa Mission , near Alice Springs in Australia 's Northern Territory . 
- When Josepha Petrick began painting for Mbantua Gallery in central Australia , she indicated that her name was Josepha rather than Josie , and that this was how she henceforth wished to be known ; however Mbantua 's biography is the only source that has used that version of her name . 
- After marrying Robin Petyarre , brother of artist Gloria Petyarre , Josepha Petrick moved to the region of Utopia , north @-@ east of Alice Springs , which is where she was living when she began painting around 1990 . They had seven children , one of whom , Damien Petrick , went on to become an artist like his mother . By 2008 , Josie Petrick 's husband had died , and Petrick was dividing her time between Alice Springs and Harts Range , to its north @-@ east . 
- 
- = = Professional background = = 
- 
- Contemporary Indigenous art of the western desert began in 1971 when Indigenous men at Papunya created murals and canvases using western art materials , assisted by teacher Geoffrey Bardon . Their work , which used acrylic paints to create designs representing body painting and ground sculptures , rapidly spread across Indigenous communities of central Australia , particularly after the introduction of a government @-@ sanctioned art program in central Australia in 1983 . By the 1980s and ' 90s , such work was being exhibited internationally . The first artists , including all of the founders of the Papunya Tula artists ' company , were men , and there was resistance among the Pintupi men of central Australia to women also painting . However , many of the women wished to participate , and in the 1990s many of them began to paint . In the western desert communities such as Utopia , Kintore , Yuendumu , Balgo , and on the outstations , people were beginning to create art works expressly for exhibition and sale . 
- 
- = = = Career = = = 
- 
- Josepha Petrick began painting about 1990 or 1992 as part of the contemporary Indigenous art movement that had begun at Papunya in the 1970s . By 1998 her work was being collected by both private and public institutions , such as Charles Sturt University , and in 2005 a work was purchased by the National Gallery of Victoria . Her career received a significant boost when her work was included in the National Gallery of Victoria 's 2006 Landmarks exhibition and its catalogue ; her painting was printed opposite that of Yannima Tommy Watson , who was by this time famous , particularly for his contribution to the design of a new building for the Musée du quai Branly . Petrick 's paintings have been included at exhibitions in several private galleries in Melbourne and Hong Kong , as well as at the Australian embassy in Washington in 2001 . 
- In 2006 a commissioned work by Petrick was exhibited at Shalom College at the University of New South Wales as part of a charity fundraising exhibition . It sold for A $ 22 @,@ 000 . As of the end of 2008 , the highest recorded auction price for an item of Petrick 's work was $ 22 @,@ 800 , set in May 2007 . An image based on a triptych by Petrick , Bush Berries , appears on the cover of a book on the visual perception of motion , Motion Vision . 
- Central Australian artists frequently paint particular " dreamings " , or stories , for which they have responsibility or rights . These stories are used to pass " important knowledge , cultural values and belief systems " from generation to generation . Paintings by Petrick portray two different groups of dreamings , rendered in two distinct styles . Bush plum dreaming represents a plant of the central Australian desert which is " a source of physical and spiritual sustenance , reminding [ the local Indigenous people ] of the sacredness of [ their ] country " . These paintings are undertaken with red , blue and orange dots that represent the fruit at different stages in its development . She also paints women ’ s ceremonies ( Awelye ) and dreamings , and these are created using rows of coloured dots and include representations of women 's ceremonial iconography . 
- Journalist Zelda Cawthorne described Petrick as one of the " finest contemporary Aboriginal artists " . Art consultant Adrian Newstead has ranked her as amongst the country 's top 200 Indigenous artists , noting that she has become " known for innovative works that create a sense of visual harmony through fine variegated fields of immaculately applied dotting " . Her style is described by Indigenous art writers Birnberg and Kreczmanski as an " interesting , modern interpretation of landscape " . 
- Petrick 's work is held in a variety of public and private collections , including Artbank , the Charles Sturt University Collection , the Holmes a Court Collection , and the National Gallery of Victoria . 
- 
- 
- = Head VI = 
- 
- Head VI is an oil @-@ on @-@ canvas painting by the Irish @-@ born English figurative artist Francis Bacon , the last of six panels making up his " 1949 Head " series . It shows a bust view of a single figure , modeled on Diego Velázquez 's Portrait of Innocent X. Bacon applies forceful , expressive brush strokes , and places the figure within a glass cage structure , behind curtain @-@ like drapery . This gives the effect of a man trapped and suffocated by his surroundings , screaming into an airless void . 
- Head VI was the first of Bacon 's paintings to reference Velázquez , whose portrait of Pope Innocent X haunted him throughout his career and inspired his series of " screaming popes " , a loose series of which there are around 45 surviving individual works . Head VI contains many motifs that were to reappear in Bacon 's work . The hanging object , which may be a light switch or curtain tassel , can be found even in his late paintings . The geometric cage is a motif that appears as late as his 1985 – 86 masterpiece , Study for a Self @-@ Portrait — Triptych . 
- Head VI was first exhibited in November 1949 at the Hanover Gallery in London , in a showing organised by one of the artist 's early champions , Erica Brausen . At the time , Bacon was a highly controversial but respected artist , best known for his 1944 Three Studies for Figures at the Base of a Crucifixion , which made him the enfant terrible of British art . Head VI drew a mixed reaction from art critics ; John Russell , later Bacon 's biographer , at the time dismissed it as a cross between " an alligator shorn of its jaws and an accountant in pince @-@ nez who has come to a bad end " . In 1989 Lawrence Gowing wrote that the " shock of the picture , when it was seen with a whole series of heads ... was indescribable . It was everything unpardonable . The paradoxical appearance at once of pastiche and iconoclasm was indeed one of Bacon 's most original strokes . " Art critic and curator David Sylvester described it as a seminal piece from Bacon 's unusually productive 1949 – 50 period , and one of Bacon 's finest popes . 
- 
- = = 1949 Head series = = 
- 
- Bacon 's output is characterised by sequences of images . He told Sylvester that his imagination was stimulated by sequences and that " images breed other images in me " . His series were not always planned or painted in sequence ; sometimes paintings are grouped for convenience but vary in execution and tone . The idea for the head series came after he returned penniless , late in 1948 , from a stay in Tangier . In the previous three years he had been unable to find a voice ; the last surviving canvas from this period is his Painting ( 1946 ) . Although he continued to paint , he was a ruthless self critic , given to slashing canvases with blades , and no works survive from between 1947 and the winter of 1948 . Gallerist Erica Brausen offered Bacon the opportunity of a solo show for the opening of her new Hanover Gallery . He agreed , but had nothing in reserve to hang . In following years , Brausen became perhaps the most important of Bacon 's early champions ; she arranged this showing — his debut solo exhibition — publicised him widely and organised viewings for international buyers . 
- Already 40 years old , Bacon viewed the exhibition as his last chance and applied himself to the task with determination . Because he had destroyed all his output of the last three years , he had little choice but to present new works . He did not have a grand plan when he agreed to the show , but eventually found themes that interested him in his Head I of the previous year , and executed five progressively stronger variants in the final weeks before the November exhibition , completing the series barely in time for the opening . 
- The paintings depict isolated figures enclosed in spaces that are undefined , overwhelmingly claustrophobic , reductive and eerie . Coming early in Bacon 's career , they are uneven in quality , but show a clear progression especially in how they utilise and present ideas he was still clearly developing and coming to terms with . Head I ( actually begun in the winter of 1948 ) and Head II show formless pieces of flesh that broadly resemble human heads ; they have half @-@ open eyes and a pharynx , though it is positioned much higher than would be expected in a human . Heads III , IV and V show fully formed busts recognisable as men , and are characterised by a haunted atmosphere . These two broad ideas coalesce in Head VI , which is as physiologically tortured as the first two paintings , and as spectral as the middle three . In Head VI the figure has developed and is now shown wearing vestments , the first indication in Bacon 's work of the influence of Velázquez , while the focus has become the open mouth and the study of the human scream . 
- Bacon said that chance played a significant role in his work , and that he often approached a canvas without having a clear idea of what might emerge . This was especially the case in the mid to late 1940s , a period when he was drinking heavily and spending most nights in Soho casinos and poker rooms . The following morning he would often approach his canvas " in a bad mood of drinking ... under tremendous hangovers and drink ; I sometimes hardly knew what I was doing . " He incorporated his appetite for chance into his work : an image often would morph mid @-@ way through into something quite different from what he had first intended . He actively sought out this freedom and felt it crucial to his progression as an artist . To him , lifestyle and art were intertwined ; he said that " perhaps the drink helped me to be a bit freer . " This is very evident in the 1949 series , which began as a rather morbid study of a collapsed head , but evolved over the six surviving panels into a reworking of Velázquez masterpieces , and arrived at an image that was to preoccupy Bacon for the subsequent 20 years . 
- The series marks Bacon 's first attempt at depicting lone figures in rooms . For him , the key aspect was that it appeared that the subject felt isolated , unobserved , and had abandoned the need to present an outward face . He believed that under these circumstances all pretence falls away , and the social being becomes the sum of its neuroses , which Bacon attempted to convey by reducing the subject to its bare @-@ bones features : a mouth , ears , eyes , a jaw . According to Russell , " the view out front ceases to be the only one , and our person is suddenly adrift , fragmented , and subject to strange mutation . " Russell observed that while the depiction of figures in rooms is common through all eras of painting , the figures are always posed , and usually seemingly aware that they are being portrayed . This conceit is abandoned in Bacon 's series . 
- Head I , completed late in 1948 , is considered more successful than Head II . Although it is well @-@ regarded critically , Head II is seen as something of a creative cul @-@ de @-@ sac , while Heads III , IV and V are usually considered as merely intermediate steps towards Head VI . It is exceptional in Bacon 's oeuvre that works of their relative poor quality survive ; he was ruthlessly self @-@ critical and often slashed or abandoned canvasses before they were completed . When pressed again by Brausen in 1953 to produce works for a New York show that she had been publicising for a year , he was full of doubt and destroyed most of what he had been working on , including several other popes . 
- Brausen commissioned another showing to be held in 1950 , for which Bacon painted three large popes modelled on Velázquez 's portrait . The gallery advertised the show as " Francis Bacon : Three Studies from the Painting of Innocent X by Velázquez " , but in the end Bacon was dissatisfied with the works and destroyed them before the show opened . 
- 
- = = Description = = 
- 
- The figure is clearly identifiable as a pope from his clothing . It seems trapped and isolated within the outlines of an abstract three @-@ dimensional glass cage . This framing device , described by Sylvester as a " space @-@ frame " , was to feature heavily throughout the artist 's career . A cord hangs from the upper edge of the glass case , falling just in front of the pope 's face and partially covering his eyes . It is too indistinctly drawn to identify with certainty , but given the presence of similar objects in Bacon 's later works , may be either the end of a hanging light switch or the tassel of a curtain ; the hanging cord was to become a signature for the artist . Apart from its symbolic meaning , it has a compositional function , framing the painting with a further set of vertical lines . Such an object reappears most prominently in the centre panel of his 1973 Triptych , May – June 1973 , where it is clearly a dangling light bulb . For Bacon , these elements were intended to make the figure waver in and out of sight for the viewer , alluding to the fact that bulbs can be on or off , curtains open or closed . 
- The figure 's mouth is opened wide as if screaming , an expression Bacon took from a still he kept of the nurse screaming in Sergei Eisenstein 's Odessa Steps massacre sequence in his 1925 silent film Battleship Potemkin . In 1984 , the broadcaster Melvyn Bragg asked Bacon about the still , and observed that in his earlier career the artist seemed preoccupied with the physicality of the human mouth . Bacon replied , " I had always thought that I would be able to make the mouth with all the beauty of a Monet landscape though I never succeeded in doing so . " When Bragg asked why he thought he had failed , Bacon said , " It should be all much more colour , should have got more of the interior of the mouth , with all the colours of the interior of the mouth , but I didn 't happen to get it . " His interest in the mouth was further stimulated by a medical textbook of diseased oral cavities bought in a second @-@ hand bookshop , kept in his studio and to which he often referred . 
- The glass cage might imply a vacuum that the figure 's voice is unable to escape ; as if it is screaming in silence . Rueful later in life , Bacon said that he had " wanted to paint the scream more than the horror . I think , if I had really thought about what causes somebody to really scream , it would have made the scream ... more successful " . The work evokes memories of the Second World War . The glass enclosure of his 1949 Chicago Study for a Portrait is often seen as prophesying photographs of Adolf Eichmann 's 1961 trial before a Jerusalem District Court , when he was held within a similar cage . Bacon strongly resisted literal comparisons though , and stated that he used the device so he could frame and " really see the image – for no other reason . I know it 's been interpreted as being many other things . " Other critics saw similarities between the glass case and the radio booths of late 1930s broadcasters who warned against the impending calamity . Denis Farr notes that Bacon was sympathetic to George Orwell and referred in interviews to Orwellian " shouting voices ... and trembling hands ... convey [ ing ] the harsh atmosphere of an interrogation . " 
- 
- = = Influences = = 
- 
- The so @-@ called " space frame " had already been used by Alberto Giacometti in the 1930s , and the two artists became friends in the 1960s . However Giacometti had by 1949 used it only in surrealist contexts before Bacon 's adaption , and in turn influenced his use in " The Cage " of 1950 . A similar two dimensional construct is found in Henry Moore 's works , notably his " Maquette for King and Queen " , constructed three years after Bacon 's Head . It is difficult to untangle how these artists influenced and informed each other . What is notable is that Bacon continued to use the motif , with intervals until the end of his life . Sylvester suggests his finest example is the 1970 Three Studies of the Male Back . 
- The full @-@ length golden curtain @-@ like folds painted in heavy brush strokes are in part influenced by Degas but also similar to Titian 's 1558 Portrait of Cardinal Filippo Archinto . Bacon adapts the Old Master 's device to isolate and distance the sitter from the viewer ; the black ground @-@ paint is visible through the folds , making the separation all the more affecting . Bacon had already used similar forms in his Chicago panel , and they were to become a feature of his most acclaimed 1950s works , especially in his " screaming popes " . He became fascinated with the veil or curtain as a motif in painting , and collected many reproductions of works by Titian and Degas in which it is employed . He had begun his career as an interior decorator and designer of furniture and rugs in the mid @-@ 1930s , and later said that he liked " rooms hung all round with just curtains hung in even folds " . Veils or curtains appear in Bacon 's earliest works , notably the 1949 Study from the Human Body , always in portraits and always in front of , rather than behind , the figure . 
- Head VI is closely modelled on Velázquez 's c . 1650 Portrait of Innocent X , today in the Doria Pamphilj Gallery , Rome . Bacon cautiously avoided seeing the original , even when he spent three months in Rome in 1954 . Critics speculate he was afraid of being disappointed , or thought that an intimate knowledge of the painting would dull his imagination . Yet his fascination was all @-@ consuming and he reproduced variants of it obsessively for almost two decades ; an examination and homage described as " without parallel in the history of art " . Bacon 's approach differs to Velázquez 's in a number of ways : both artists were expressive , yet Bacon 's broad brush @-@ strokes and freedom with paint contrast with Velázquez 's tight and controlled treatment . He adapts Velázquez 's positioning of the pope to place him above the viewer 's point of view , elevating and distancing him . This was already a common technique in commercial , promotional photography but in Bacon 's hands , Schmied argues , the angle places the pope on a kind of stage for the viewer to coldly observe . 
- Although Bacon revered Velázquez 's portrait , he did not try and reproduce the earlier painting . In interviews , he said that he saw flaws in Velázquez 's work and that he viewed that social structure and order as , according to art historian Wieland Schmied , " obsolete and decayed " . Bacon 's approach was to elevate his subject so he could knock him down again , thereby making a sly comment on the treatment of royalty in both old master and contemporary painting . Yet Velázquez 's influence is apparent in many aspects of the painting . The sitter 's pose closely echoes the original , as does the violet and white colouring of his cope , which is built up through broad , thick , brush @-@ strokes . The influence can be further seen in the gold @-@ coloured ornaments on the back of the seat that extend on both sides of the figure . Art historian Armin Zweite describes the work as a mixture of reverence and subversion that pays tribute to Velázquez , while at the same time deconstructing his painting . 
- Sylvester detects the influence of late works by Titian in other aspects , especially in the deep and rich colouring , Velázquez 's portrayals of Philip IV , and agrees with identification of pastels of Edgar Degas as a source . He believes Bacon borrowed from Degas the use of parallel heavy folds to create the illusion of what Degas described as " shuttering " , as seen in the earlier artist 's After the Bath , Woman drying herself . Sylvester makes a further direct link between the folds and the transparent veil in Titian 's Portrait of Cardinal Filippo Archinto . He believes the folds serve to " push the viewer back " , creating a distance from the subject , an effect he sees as similar to the separation between an orchestra and setting ; others view the folds as more closely resembling the bars of a prison . Sylvester describes them as an accentuation of background verticals into stripes that are made to appear as if they pass through the sitter . In his " Interviews with Francis Bacon " series of books , he asked Bacon why he found the effect so poignant . The artist replied , " Well , it means that the sensation doesn 't come straight out at you but slides slowly and gently through the gaps . " 
- When asked why he was compelled to revisit the Velázquez so often , Bacon replied that he had nothing against popes per se , but merely sought " an excuse to use these colours , and you can 't give ordinary clothes that purple colour without getting into a sort of false fauve manner . " Schmied sees Head VI as a reaction against Velázquez , and a commentary on how the papacy is " obsolete and decayed " , with a pope resistant to both modernisation and secularisation . To him , the figure seems to " resist the maltreatment of image and tries to halt the impending collapse of the established work order . He screams and grimaces , clutching at arms of his throne . " Sylvester notes that Bacon was impressed by Picasso 's figuration and handling of paint , especially in Picasso 's 1930s works ; and suggests that the white blobs around the pope 's cape may be influenced by the 1913 Woman in a Slip Seated in an Armchair . 
- 
- = = Critical reception = = 
- 
- When Bacon undertook the series late in 1948 he was something of a two @-@ hit wonder . He had success in 1944 with Three Studies for Figures at the Base of a Crucifixion and to a lesser extent with Painting ( 1946 ) , both of which were highly regarded but viewed as sensationalist . The exhibition was a success , and marked his critical breakthrough . Until then , he had been highly regarded but capable of only occasional brilliance . The full show established him in the minds of critics as , according to Michael Peppiatt , " more of a force to be reckoned with in the contemporary scene " . While some found his images horrifying and unnerving , they wrote about him all the same , sealing his reputation as the enfant terrible of post @-@ war British art . The critic for The Observer wrote , " The recent paintings ... horrifying as they are , cannot be ignored . Technically they are superb , and the masterly handling of large areas of pearly grey , flushed with a sudden pink or green , only makes me regret the more that the artist 's gift should have been brought to subjects so esoteric " . 
- Most critics focused on Heads I and VI , remarking favourably on the progression between the two . While some found the inherent violence of the paintings distasteful , Brausen was a skilled publicist and turned the bad press into notoriety , and brought Bacon 's work to national attention . Peppiatt notes that the exhibition showed Bacon no longer needed sensationalist material to make an impact , and was now capable of creating an intense emotional response through more subtle means , and had found a way of presenting the human condition in the way he had sought , by presenting his sitter " in a vestigial setting , a cage or [ behind ] a parted curtain ... the rest , the most essential , lay in the manipulation of the infinitely suggestive medium of oil paint " . After the showing Bacon gradually became " less the outsider with an occasional image of horrifying brilliance and more a force to be reckoned with on the contemporary scene " . His reputation and the value of his panels rose dramatically , and after the showing he was sought after by European , American and African collectors and galleries , commanding prices as high as £ 400 for single works , unusual for a contemporary British artist of the time . 
- 
- = = Provenance = = 
- 
- Head VI was first exhibited at the Hanover Gallery , London , in 1949 . It was acquired by the Arts Council 's Hayward Gallery in 1952 . The Hayward has loaned it out a number of times since , including for major retrospectives at the Grand Palais , Paris in 1971 , and the Hugh Lane Gallery , Dublin , in 2000 . 
- In May 1996 , the National Gallery took on loan Velázquez 's Innocent X portrait and hung it alongside four Bacon paintings ; Head VI , Pope I ( 1951 ) , Pope 1961 and Pope 1965 . Peppiatt believes that Bacon would have disapproved of such a showing with a work he considered one of the finest ever painted , but writes that two , including Head VI , " stood up to it , and even enhanced its authority as one of the most penetrating studies of human nature and human power " . 
- 
- 
- = Imagism = 
- 
- Imagism was a movement in early 20th @-@ century Anglo @-@ American poetry that favored precision of imagery and clear , sharp language . 
- Imagism has been described as the most influential movement in English poetry since the activity of the Pre @-@ Raphaelites . As a poetic style it gave Modernism its start in the early 20th century , and is considered to be the first organized Modernist literary movement in the English language . Imagism is sometimes viewed as ' a succession of creative moments ' rather than any continuous or sustained period of development . René Taupin remarked that ' It is more accurate to consider Imagism not as a doctrine , nor even as a poetic school , but as the association of a few poets who were for a certain time in agreement on a small number of important principles ' . 
- The Imagists rejected the sentiment and discursiveness typical of much Romantic and Victorian poetry , in contrast to their contemporaries , the Georgian poets , who were generally content to work within that tradition . In contrast , Imagism called for a return to what were seen as more Classical values , such as directness of presentation and economy of language , as well as a willingness to experiment with non @-@ traditional verse forms . Imagists use free verse . 
- Imagist publications appearing between 1914 and 1917 featured works by many of the most prominent modernist figures , both in poetry and in other fields . The Imagist group was centered in London , with members from Great Britain , Ireland and the United States . Somewhat unusually for the time , a number of women writers were major Imagist figures . 
- A characteristic feature of Imagism is its attempt to isolate a single image to reveal its essence . This feature mirrors contemporary developments in avant @-@ garde art , especially Cubism . Although Imagism isolates objects through the use of what Ezra Pound called " luminous details " , Pound 's Ideogrammic Method of juxtaposing concrete instances to express an abstraction is similar to Cubism 's manner of synthesizing multiple perspectives into a single image . 
- 
- = = Pre @-@ Imagism = = 
- 
- Well @-@ known poets of the Edwardian era of the 1890s , such as Alfred Austin , Stephen Phillips , and William Watson , had been working very much in the shadow of Tennyson , producing weak imitations of the poetry of the Victorian era . They continued to work in this vein into the early years of the 20th century . As the new century opened , Austin was still the serving British Poet Laureate , a post which he held up to 1913 . In the century 's first decade , poetry still had a large audience ; volumes of verse published in that time included Thomas Hardy 's The Dynasts , Christina Rossetti 's posthumous Poetical Works , Ernest Dowson 's Poems , George Meredith 's Last Poems , Robert Service 's Ballads of a Cheechako and John Masefield 's Ballads and Poems . Future Nobel Prize winner William Butler Yeats was devoting much of his energy to the Abbey Theatre and writing for the stage , producing relatively little lyric poetry during this period . In 1907 , the Nobel Prize for Literature was awarded to Rudyard Kipling . 
- The origins of Imagism are to be found in two poems , Autumn and A City Sunset by T. E. Hulme . These were published in January 1909 by the Poets ' Club in London in a booklet called For Christmas MDCCCCVIII . Hulme was a student of mathematics and philosophy ; he had been involved in the setting up of the club in 1908 and was its first secretary . Around the end of 1908 , he presented his paper A Lecture on Modern Poetry at one of the club 's meetings . Writing in A. R. Orage 's magazine The New Age , the poet and critic F. S. Flint ( a champion of free verse and modern French poetry ) was highly critical of the club and its publications . From the ensuing debate , Hulme and Flint became close friends . In 1909 , Hulme left the Poets ' Club and started meeting with Flint and other poets in a new group which Hulme referred to as the " Secession Club " ; they met at the Eiffel Tower restaurant in London 's Soho to discuss plans to reform contemporary poetry through free verse and the tanka and haiku and the removal of all unnecessary verbiage from poems . The interest in Japanese verse forms can be placed in a context of the late Victorian and Edwardian revival of interest in Chinoiserie and Japonism as witnessed in the 1890s vogue for William Anderson 's Japanese prints donated to the British Museum , performances of Noh plays in London , and the success of Gilbert and Sullivan 's operetta The Mikado ( 1885 ) . Direct literary models were available from a number of sources , including F. V. Dickins 's 1866 Hyak nin is 'shiu , or , Stanzas by a Century of Poets , Being Japanese Lyrical Odes , the first English @-@ language version of the Hyakunin isshu , a 13th @-@ century anthology of 100 waka , the early 20th @-@ century critical writings and poems of Sadakichi Hartmann , and contemporary French @-@ language translations . 
- The American poet Ezra Pound was introduced to the group in April 1909 and found that their ideas were close to his own . In particular , Pound 's studies of Romantic literature had led him to an admiration of the condensed , direct expression that he detected in the writings of Arnaut Daniel , Dante , and Guido Cavalcanti , amongst others . For example , in his 1911 – 12 series of essays I gather the limbs of Osiris , Pound writes of Daniel 's line " pensar de lieis m 'es repaus " ( " it rests me to think of her " ) ( from the canzone En breu brizara 'l temps braus ) : " You cannot get statement simpler than that , or clearer , or less rhetorical " . These criteria of directness , clarity and lack of rhetoric were to be amongst the defining qualities of Imagist poetry . Through his friendship with Laurence Binyon , Pound had already developed an interest in Japanese art by examining Nishiki @-@ e prints at the British Museum , and he quickly became absorbed in the study of related Japanese verse forms . 
- In an article in La France , 1915 , the French critic , Remy de Gourmont described the Imagists as descendants of the French Symbolistes and in a 1928 letter to the French critic and translator René Taupin , Pound was keen to emphasise another ancestry for Imagism , pointing out that Hulme was indebted to a Symbolist tradition , linking back via William Butler Yeats , Arthur Symons and the Rhymers ' Club generation of British poets to Mallarmé . The Symbolist source was amplified further in Taupin 's study published in 1929 , in which he concluded however great the divergence of technique and language ' between the image of the Imagist and the ' symbol ' of the Symbolists there is a difference only of precision ' . In 1915 , Pound edited the poetry of another 1890s poet , Lionel Johnson for the publisher Elkin Mathews . In his introduction , he wrote 
- 
- = = Early publications and statements of intent = = 
- 
- In 1911 , Pound introduced two other poets to the Eiffel Tower group : his former fiancée Hilda Doolittle ( who had started signing her work H.D. ) and her future husband Richard Aldington . These two were interested in exploring Greek poetic models , especially Sappho , an interest that Pound shared . The compression of expression that they achieved by following the Greek example complemented the proto @-@ Imagist interest in Japanese poetry , and , in 1912 , during a meeting with them in the British Museum tea room , Pound told H.D. and Aldington that they were Imagistes and even appended the signature H.D. Imagiste to some poems they were discussing . 
- When Harriet Monroe started her Poetry magazine in 1911 , she had asked Pound to act as foreign editor . In October 1912 , he submitted thereto three poems each by H.D. and Aldington under the Imagiste rubric , ( published in the November 1912 second issue thereof ) with a note which described Aldington as ' one of the ' Imagistes ' . This note , along with the appendix note ( ' The Complete Works of T. E. Hulme ' ) in Pound 's book ( also published in Autumn 1912 ) entitled Ripostes are considered to be the first appearances of the word Imagiste ( later anglicised to ' Imagists ' ) in print . 
- Aldington 's poems , Choricos , To a Greek Marble , and Au Vieux Jardin , were in the November issue of Poetry , and H.D. ' s , Hermes of the Ways , Priapus , and Epigram , appeared in the January 1913 issue ; Imagism as a movement was launched . Poetry 's April issue published what came to be seen as " Imagism 's enabling text " , the haiku @-@ like poem of Ezra Pound entitled " In a Station of the Metro " : 
- The apparition of these faces in the crowd ; 
- Petals on a wet , black bough . 
- The March 1913 issue of Poetry contained A Few Don 'ts by an Imagiste and the essay entitled Imagisme both written by Pound , with the latter being attributed to Flint . The latter contained this succinct statement of the group 's position : 
- Direct treatment of the " thing " , whether subjective or objective . 
- To use absolutely no word that does not contribute to the presentation . 
- As regarding rhythm : to compose in sequence of the musical phrase , not in sequence of the metronome . 
- Pound 's note opened with a definition of an image as " that which presents an intellectual and emotional complex in an instant of time " . Pound goes on to state , " It is better to present one Image in a lifetime than to produce voluminous works " . His list of " don 'ts " reinforced his three statements in " Imagism " , while warning that they should not be considered as dogma but as the " result of long contemplation " . Taken together , these two texts comprised the Imagist programme for a return to what they saw as the best poetic practice of the past . F.S. Flint commented " we have never claimed to have invented the moon . We do not pretend that our ideas are original . " 
- The 1916 preface to Some Imagist Poets comments " Imagism does not merely mean the presentation of pictures . Imagism refers to the manner of presentation , not to the subject . " 
- 
- = = Des Imagistes = = 
- 
- Determined to promote the work of the Imagists , and particularly of Aldington and H.D. , Pound decided to publish an anthology under the title Des Imagistes . It was first published in Alfred Kreymborg 's little magazine The Glebe and was later published in 1914 by Alfred and Charles Boni in New York and by Harold Monro at the Poetry Bookshop in London . It became one of the most important and influential English @-@ language collections of modernist verse . Included in the thirty @-@ seven poems were ten poems by Aldington , seven by H.D. , and six by Pound . The book also included work by F.S. Flint , Skipwith Cannell , Amy Lowell , William Carlos Williams , James Joyce , Ford Madox Ford , Allen Upward and John Cournos . Max Michelson was another poet included in the important 1963 anthology by William Pratt The Imagist Poem Modern Poetry in miniature . 
- Pound 's editorial choices were based on what he saw as the degree of sympathy that these writers displayed with Imagist precepts , rather than active participation in a group as such . Williams , who was based in the United States , had not participated in any of the discussions of the Eiffel Tower group . However , he and Pound had long been corresponding on the question of the renewal of poetry along similar lines . Ford was included at least partly because of his strong influence on Pound , as the younger poet made the transition from his earlier , Pre @-@ Raphaelite @-@ influenced style towards a harder , more modern way of writing . The inclusion of a poem by Joyce , I Hear an Army , which was sent to Pound by W.B. Yeats , took on a wider importance in the history of literary modernism , as the subsequent correspondence between the two led to the serial publication , at Pound 's behest , of A Portrait of the Artist as a Young Man in The Egoist . Joyce 's poem is not written in free verse , but in rhyming quatrains . However , it strongly reflects Pound 's interest in poems written to be sung to music , such as those by the troubadours and Guido Cavalcanti . The book met with little popular or critical success , at least partly because it had no introduction or commentary to explain what the poets were attempting to do , and a number of copies were returned to the publisher . 
- 
- = = Some Imagist Poets = = 
- 
- The following year , Pound and Flint fell out over their different interpretations of the history and goals of the group arising from an article on the history of Imagism written by Flint and published in The Egoist in May 1915 . Flint was at pains to emphasise the contribution of the Eiffel Tower poets , especially Edward Storer . Pound , who believed that the " Hellenic hardness " that he saw as the distinguishing quality of the poems of H.D. and Aldington was likely to be diluted by the " custard " of Storer , was to play no further direct role in the history of the Imagists . He went on to co @-@ found the Vorticists with his friend , the painter and writer Wyndham Lewis . 
- Around this time , the American Imagist Amy Lowell moved to London , determined to promote her own work and that of the other Imagist poets . Lowell was a wealthy heiress from Boston whose brother Abbott Lawrence Lowell was President of Harvard University from 1909 @-@ 1933 . She loved Keats and cigars . She was also an enthusiastic champion of literary experiment who was willing to use her money to publish the group . Lowell was determined to change the method of selection from Pound 's autocratic editorial attitude to a more democratic manner . This new editorial policy was stated in the Preface to the first anthology to appear under her leadership : " In this new book we have followed a slightly different arrangement to that of our former Anthology . Instead of an arbitrary selection by an editor , each poet has been permitted to represent himself by the work he considers his best , the only stipulation being that it should not yet have appeared in book form . " The outcome was a series of Imagist anthologies under the title Some Imagist Poets . The first of these appeared in 1915 , planned and assembled mainly by H.D. and Aldington . Two further issues , both edited by Lowell , were published in 1916 and 1917 . These three volumes featured most of the original poets , ( also including imagist poetry by the American poet John Gould Fletcher ) , with the exception of Pound , who had tried to persuade her to drop the Imagist name from her publications and who sardonically dubbed this phase of Imagism " Amy @-@ gism . " 
- Lowell persuaded D. H. Lawrence to contribute poems to the 1915 and 1916 volumes , making him the only writer to publish as both a Georgian poet and an Imagist . Marianne Moore also became associated with the group during this period . However , with World War I as a backdrop , the times were not easy for avant @-@ garde literary movements ( Aldington , for example , spent much of the war at the front ) , and the 1917 anthology effectively marked the end of the Imagists as a movement . 
- 
- = = Imagists after Imagism = = 
- 
- In 1929 , Walter Lowenfels jokingly suggested that Aldington should produce a new Imagist anthology . Aldington , by now a successful novelist , took up the suggestion and enlisted the help of Ford and H.D. The result was the Imagist Anthology 1930 , edited by Aldington and including all the contributors to the four earlier anthologies with the exception of Lowell , who had died , Cannell , who had disappeared , and Pound , who declined . The appearance of this anthology initiated a critical discussion of the place of the Imagists in the history of 20th @-@ century poetry . 
- Of the poets who were published in the various Imagist anthologies , Joyce , Lawrence and Aldington are now primarily remembered and read as novelists . Marianne Moore , who was at most a fringe member of the group , carved out a unique poetic style of her own that retained an Imagist concern with compression of language . William Carlos Williams developed his poetic along distinctly American lines with his variable foot and a diction he claimed was taken " from the mouths of Polish mothers " . Both Pound and H.D. turned to writing long poems , but retained much of the hard edge to their language as an Imagist legacy . Most of the other members of the group are largely forgotten outside the context of the history of Imagism . 
- 
- = = Legacy = = 
- 
- Despite the movement 's short life , Imagism would deeply influence the course of modernist poetry in English . Richard Aldington , in his 1941 memoir , writes : " I think the poems of Ezra Pound , D.H. , Lawrence , and Ford Madox Ford will continue to be read . And to a considerable extent T. S. Eliot and his followers have carried on their operations from positions won by the Imagists . " 
- On the other hand , Wallace Stevens found shortcomings in the Imagist approach : " Not all objects are equal . The vice of imagism was that it did not recognize this . " With its demand for hardness , clarity and precision and its insistence on fidelity to appearances coupled with its rejection of irrelevant subjective emotions Imagism had later effects that are demonstrable in T. S. Eliot 's ' Preludes ' and ' Morning at the Window ' and in D. H. Lawrence 's animal and flower pieces . The rejection of conventional verse forms in the nineteen @-@ twenties owed much to the Imagists ' repudiation of the Georgian Poetry style . 
- The influence of Imagism can be seen clearly in the work of the Objectivist poets , who came to prominence in the 1930s under the auspices of Pound and Williams . The Objectivists worked mainly in free verse . Clearly linking Objectivism 's principles with Imagism 's , Louis Zukofsky insisted , in his introduction to the 1931 Objectivist issue of Poetry , on writing " which is the detail , not mirage , of seeing , of thinking with the things as they exist , and of directing them along a line of melody . " Zukofsky was a major influence on the Language poets , who carried the Imagist focus on formal concerns to a high level of development . Basil Bunting , another Objectivist poet , was a key figure in the early development of the British Poetry Revival , a loose movement that also absorbed the influence of the San Francisco Renaissance poets . 
- Imagism influenced a number of poetry circles and movements . With the Imagists , free verse became a discipline and acquired status as a legitimate poetic form . This influence was felt especially in the 1950s , with the Beat generation , the Black Mountain poets , and others associated with the San Francisco Renaissance . In his seminal 1950 essay Projective Verse , Charles Olson , the theorist of the Black Mountain group , wrote " ONE PERCEPTION MUST IMMEDIATELY AND DIRECTLY LEAD TO A FURTHER PERCEPTION " ; his credo derived from and supplemented the Imagists . 
- Among the Beats , Gary Snyder and Allen Ginsberg in particular were influenced by the Imagist emphasis on Chinese and Japanese poetry . William Carlos Williams was another who had a strong effect on the Beat poets , encouraging poets like Lew Welch and writing an introduction for the book publication of Ginsberg 's Howl ( 1955 ) . 
- 
- 
- = Operation Eastern Exit = 
- 
- Operation Eastern Exit was the codename given to the military evacuation of the United States embassy in Mogadishu , the capital of Somalia , in January 1991 . In late December 1990 , violence quickly enveloped the city as armed militants began clashing with government soldiers . On 1 January 1991 , the US Ambassador to Somalia , James Keough Bishop , contacted the Department of State requesting an evacuation of the embassy , which was approved the following day . United States Central Command began planning and mobilizing forces that evening . The initial plan was to evacuate with a military transport plane through the Mogadishu International Airport , but this was later abandoned . A helicopter evacuation via the USS Guam and USS Trenton was the remaining option . 
- On the morning of 5 January , a 60 @-@ person Marine and Navy SEAL security detail was dispatched from Guam aboard two CH @-@ 53E Super Stallion helicopters to secure the embassy and prepare for the main evacuation . The two helicopters returned to Guam with the first 61 evacuees . Throughout the day , foreign diplomats and civilians sought refuge at the embassy . Four waves of five CH @-@ 46 Sea Knight helicopters each evacuated the embassy compound shortly after midnight on 6 January . The evacuees were transported to Muscat , Oman , where they disembarked on 11 January . In total , 281 diplomats and civilians from 30 countries were evacuated , including 12 heads of missions ( eight ambassadors and four chargés d 'affaires ) . 
- 
- = = Background = = 
- 
- In the late 1980s , there was increasing rebellion against the rule of Somali President Siad Barre , a military dictator who maintained tight control of power and had a record of human rights abuses . By 1990 , what began as civil disobedience evolved into a civil war , with several militias organized to overthrow the central government . 
- In July 1989 , the embassy moved to a new , 80 @-@ acre ( 32 ha ) compound , 6 miles ( 9 @.@ 7 km ) from the previous embassy and James K. Bishop was appointed as the United States ' ambassador to Somalia . Ambassador Bishop had significant experience in crisis management at US embassies . In 1967 , he was at the US Embassy in Beirut , Lebanon when the Six @-@ Day War erupted . About 3 @,@ 600 Americans were evacuated in 33 hours ; Bishop was one of 26 diplomats and soldiers that remained in the city . As deputy assistant secretary of state for Africa from 1981 – 87 , Bishop chaired several task forces for crises and gained experience in the State Department 's operations center as evacuations were carried out during several coups d 'etat . During his previous assignment as Ambassador to Liberia ( 1987 – 90 ) , Bishop was overseeing the voluntary evacuation of embassy staff and civilians as a civil war in Liberia spread , when he left in March 1990 . Soon after returning to Washington to prepare for his new appointment to Somalia , he was appointed to a taskforce to deal with the crisis in Liberia , which included a gradual evacuation of American civilians and a rapid closure of the embassy in August . 
- On 1 August , before leaving the US to take up his post in Mogadishu , Ambassador Bishop visited United States Central Command — the military command for the Middle East and northeast Africa — where he spent most of the day with its commander , Gen. Norman Schwarzkopf . Ambassador Bishop , aware of the ongoing strife , believed " the odds were better than even that we would have to leave Mogadishu under less than favorable circumstances . " Ambassador Bishop understood from his past experiences in Beirut and Liberia the importance of being prepared to deal with emergencies and spent the afternoon working with military experts to review the embassy 's Emergencies and Evacuation ( E & E ) plan until he was " satisfied ... that [ Central Command ] realized that it might have to conduct an evacuation from Mogadishu and was prepared to do that . " In its analysis of Operation Eastern Exit , the Center for Naval Analyses cited Ambassador Bishop 's previous experience and " clear understanding of his role " in the operation as one of the reasons Operation Eastern Exit went so well . 
- Hours after Ambassador Bishop 's visit to Central Command , Iraq invaded Kuwait . In 1979 , the US negotiated access to an airport and port in both Mogadishu and Berbera ; because of limited access the US had to locations in the Persian Gulf area , maintaining this access was a main interest for the Mogadishu embassy to pursue as the US mobilized to intervene in Kuwait . 
- An increasing level of criminal violence prompted Ambassador Bishop to request the voluntary evacuation of dependents ( e.g. children and spouses of staff ) and non @-@ essential staff in early December , although fighting between the government and the United Somali Congress ( a rebel militia ) remained no less than about 100 miles ( 160 km ) away . The voluntary evacuation later became a mandatory evacuation . By 19 December , the number of official US personnel in the city was reduced from 147 to 37 ; around the same time , fighting between the government and rebels came within about 40 miles ( 64 km ) of Mogadishu . 
- 
- = = = Collapse of the Barre government = = = 
- 
- On 30 December , violence escalated " an order of magnitude " as militants entered Mogadishu , which was quickly enveloped by a general state of lawlessness . On 30 – 31 December , diplomats , including many stationed in offices elsewhere in the city , were collected and housed in the embassy compound , except two volunteers who remained in the embassy 's K @-@ 7 residential apartments located across Afgoy Road from the embassy . The volunteers in the K @-@ 7 building would be needed as look @-@ outs for the embassy compound 's main gate . On the morning of 31 December , the defense attaché was nearly killed when his vehicle was sprayed with bullets and that evening , a soldier at a roadblock shot the tires of a vehicle carrying another defense official . Attempts by the US and other nations ' diplomats , in particular the Italian embassy , to negotiate a ceasefire for foreigners to leave were unsuccessful . Afgoy Road became a " shooting gallery , " preventing those in safe @-@ havens outside the embassy from reaching it . On New Year 's Day , the first American civilians began to seek refuge at the embassy . 
- Ambassador Bishop requested an evacuation of the American community on 1 January , indicating that the evacuation could be with the planned Italian , French , or German evacuation efforts , but preferred an evacuation by the US military . The State Department authorized the evacuation on 2 January and on that day , Ambassador Bishop specifically requested an evacuation by the US military , thereby initiating Operation Eastern Exit . Ambassador Bishop had spent a considerable amount of time discussing contingency plans for evacuation with other diplomatic posts . Ultimately , ten heads of missions — eight ambassadors and two chargés d 'affaires — along with their staff sought refuge in the US embassy compound and were evacuated . 
- 
- = = Plans , mobilization , and escalating violence = = 
- 
- Ambassador Bishop had visited Central Command in August 1990 , where he worked with military experts to update the embassy 's E & E plan . The first notice that an evacuation of the Mogadishu embassy would be needed came on the morning of 1 January , when the top naval commander at Central Command sent a message to his naval operations staff : " Better have Amphib crowd take a look at a helo NEO of Mogadishu ! time / distance to get there from Masirah OP area . " Following the ambassador 's 2 January evacuation request , the commander of Central Command ordered Air Force aircraft to the region , the movement of amphibious ships to Mogadishu , and requested United States Special Operations Command to prepare for a noncombatant evacuation operation . 
- The initial plan was to evacuate via Mogadishu International Airport . Soon after the evacuation request , the United States Air Force deployed C @-@ 130 transport planes and an AC @-@ 130 , for gunfire support , to Nairobi , Kenya , awaiting clearances to enter Somalia and the ability to safely transfer evacuees from the embassy to the airport . However , the US and other foreign embassies were unable to contact anyone within the government to obtain clearances . It also became apparent that the rebels had an ineffective command @-@ and @-@ control structure , making it impossible to negotiate any ceasefire or guarantee of safe passage . Likewise , government troops faced a command @-@ and @-@ control problem ; reports indicated that army units were separating along clan lines , in some cases soldiers shot officers of a different clan when given orders they disagreed with . Thus , it became clear that safe passage to the airport would not be possible . Several other nations also had aircraft mobilized to reach Mogadishu , but faced the same problems of landing and transit of evacuees to the airport . 
- On 4 January , several incidents , including a couple exchanges of gunfire , suggested that the embassy 's security detail was insufficient to hold off armed Somalis until the USS Guam and USS Trenton arrived with their helicopters and soldiers , at that time scheduled to arrive on 7 January . The embassy had just six Marine guards , whose job was limited to protecting the chancery . Ambassador Bishop made an urgent request to Washington for two platoons of soldiers to parachute into the embassy to defend it until the ships arrived . The request was denied , but the Ambassador was told that an advance element from the vessels would reach the embassy the following morning . 
- USS Guam and USS Trenton began transit from the coast of Oman towards Mogadishu at 22 : 30 ( 23 : 30 Oman time ) on 2 January . The commander of Amphibious Group Two had initially proposed a seven @-@ ship Amphibious Task Group , composed of vessels anchored at Masirah Island ( off Oman ) and Dubai and including four amphibious ships so that the full range of amphibious capabilities would be available for the operation . However , intervention in Kuwait seemed imminent and the commander of naval forces at Central Command did not want to divert that many ships from the Persian Gulf , thus the decision to send two of the closest ships . Although the two vessels were selected by mid @-@ afternoon on 2 January , the transfer of some personnel from Dubai to Masirah caused a delay of eight to ten hours . Guam and Trenton carried forces from the 4th Marine Expeditionary Brigade , including a detachment of CH @-@ 53E Super Stallion helicopters — the largest helicopters operated by the US military — and two squadrons of CH @-@ 46 Sea Knight helicopters . 
- Planning began in earnest as the ships got underway , with a combined command center on Guam . On the morning of 3 January , the task force 's command questioned why they were not given the option of an amphibious landing and requested a tank landing ship be added to the task force ; the request was denied . A warrant officer who had previously served as a Marine Security Guard at the Mogadishu embassy during the mid @-@ 1980s was found . Despite Ambassador Bishop 's planning with Central Command , the task force was provided outdated information . The former MSG told planners that a new embassy had been planned and was under construction several years prior . In fact , the new embassy was located further inland and , after receiving updated information , task force commanders determined that a beach landing , requiring troops to fight their way across the city , was too risky . Initial plans had the ships launch their helicopters at 01 : 00 on 7 January . However , in response to indications from Ambassador Bishop that conditions in Mogadishu were deteriorating , planners considered 1 @,@ 050 @-@ nautical @-@ mile ( 1 @,@ 940 km ; 1 @,@ 210 mi ) and , later , 890 @-@ nautical @-@ mile ( 1 @,@ 650 km ; 1 @,@ 020 mi ) flights with the CH @-@ 53Es while the ships were still located in the northern Arabian Sea . The situation in Mogadishu stabilized somewhat and the mission was delayed until 5 January . 
- 
- = = Evacuation = = 
- 
- On the evening of 4 January , the final execute order was issued for a 02 : 45 launch of two CH @-@ 53E Super Stallions to arrive at the embassy at dawn . The 60 soldiers selected for the security detail were issued weapons and ammunition . Two Marine Corps KC @-@ 130 refueling tankers were mobilized closer to the operation , from Bahrain to Oman , to refuel the helicopters en route to Mogadishu and the two helicopters transferred from Trenton to Guam . 
- 
- = = = Security detail and first evacuees = = = 
- 
- Two CH @-@ 53E Super Stallions carrying a 60 @-@ man security detail — 51 Marines and nine Navy SEALs — departed Guam at 02 : 47 , 466 nautical miles ( 863 km ; 536 mi ) from the embassy , and were expected to arrive at 06 : 20 . They performed two aerial refuelings . During the first refueling , a pipe burst on one of the helicopters , dousing soldiers in fuel and nearly forcing a return to Guam ; problems with the helicopters ' navigation system also complicated the refueling rendezvous . The helicopters arrived in Mogadishu at dawn , crossing the coast just south of the harbor at 25 – 50 feet ( 7 @.@ 6 – 15 @.@ 2 m ) in altitude on a route that was planned to avoid areas of more intense violence reported in the northern parts of the city . On their arrival in Mogadishu , the crew of the helicopters were using an outdated 1969 map , which showed the embassy in an isolated area . Furthermore , they had been told the embassy could be discerned by its white stucco perimeter wall and golf course . The embassy was , in fact , surrounded by new development and the crew saw white stucco walls around many buildings in the city . The helicopters were flying too low to spot a strobe light which was placed on the embassy 's water tower ( the highest point within the embassy compound ) and the golf course in the embassy compound had a black , oil @-@ coated surface — not the familiar green grass that the helicopter crew would recognize . After breaking radio silence ( their only direct communication with the embassy was unencrypted ) to contact the embassy , they were able to discern it and land at 07 : 10 . As they arrived , a group of about 100 to 150 Somalis were attempting to enter the embassy compound via ladders on the wall , but scattered as the helicopters arrived . 
- The security detail moved to establish a perimeter around the embassy compound and the Air Force 's AC @-@ 130 arrived to provide overhead support . Ambassador Bishop gave the security detail clear instructions on the rules of engagement : they could only use deadly force if people came over the embassy compound 's walls with obvious hostile intent . He also identified three zones of defense , stating a preference to retreat to the third zone before the use of deadly force : 
- the entire embassy compound 
- the Chancery , Joint Administrative Office ( JAO ) building , Marine House , and the helicopter landing zone ( HLZ ) 
- the chancery and JAO buildings ( the two " safehaven " buildings where the evacuees were held ) 
- Ambassador Bishop clearly explained his rationale to the security detail , which was to avoid any impression that they were intervening in the violence in Mogadishu . He feared that the embassy would be targeted by organized attacks if any group involved in the clashes got the impression that the US was intervening in the conflict . To this effect , he requested the Voice of America and BBC broadcast announcements that the forces were present only to evacuate the embassy and would not interfere in the conflict . The Marines who had been doused in fuel during the refueling were able to take a shower and wash their clothes . 
- After an hour on the ground , the helicopters left with the first 61 evacuees , including all American civilians and four heads of mission . Evacuees were provided blankets on one of the flights to remain warm . Complications with the only in @-@ flight refueling on the return nearly prevented refueling , which would have forced the helicopters to divert to the Somali desert and await a rescue . At 9 : 40 , the helicopters arrived on Guam and unloaded the evacuees . 
- 
- = = = Embassy during the day = = = 
- 
- No threats came upon the embassy during the day , although truckloads of armed Somalis frequently drove by the embassy along Afghoy Road . Only one incident seemed to directly target the embassy . A sniper and a spotter were positioned on the embassy 's water tower ( the highest structure in the compound ) and came under fire ; they were ordered to not return fire and soon thereafter ordered to leave their position on the water tower . 
- The Office of Military Cooperation , just one and a half blocks from the embassy , required evacuation . Despite its proximity to the embassy , an armed convoy was needed to evacuate persons trapped there by the unrest . A convoy of vehicles with several Marines and SEALs left the embassy at 8 : 47 and returned ten minutes later with 22 persons from the OMC ( four Americans , a Filipino , and 17 Kenyans ) . This was the only excursion outside the embassy by the security detail . Throughout the day , foreign diplomats contacted the embassy desiring to be evacuated ; the US welcomed these requests , but required all of them to find their own transportation to the embassy . 
- A Somali officer who had a previous relationship with the embassy , Major Siad , agreed to travel to rescue the German chargé d 'affaires and British ambassador ( junior staff from the British embassy had previously come to the US embassy ) . The Soviet Union was unable to land a plane in Mogadishu the previous day and the Soviet ambassador asked Ambassador Bishop if he and his staff could be rescued ; Ambassador Bishop , a tennis partner of his Soviet counterpart , agreed but only if they found their own way to the embassy . Seeing the helicopters on the morning of 5 January , they realized the Americans would not remain in the city much longer . At the request of Ambassador Bishop , Major Siad agreed to transport the Soviets , but only if he was paid enough ; the US embassy paid Major Siad , who returned with the Soviet ambassador and 38 of his staff . The brother of President Barre , who was also a Major General and Chief of Police , showed up at the embassy in the afternoon with 25 members of his family requesting to be evacuated , but was turned away after a vocal conversation with the ambassador . 
- The operation did not include soldiers to handle the evacuation control center ( ECC ) , which was set up in the JAO . A 44 @-@ person force consisting primarily of soldiers to handle the ECC was planned for insertion with the CH @-@ 53E Super Stallions after they had returned to the Guam . However , this was cancelled over objections from the commander of the security detail . The deficit was partially handled by embassy staff who assisted a few soldiers from the security detail . The evacuees were grouped into 15 @-@ person " sticks " to be loaded onto the helicopters and were limited to one piece of luggage apiece . Some attempted to bring more , resulting in problems coordinating their evacuation . Furthermore , many evacuees had pets they wanted to bring , which were not allowed . Most pets were killed by their owners ; some were given poison . Meanwhile , the soldiers were allowed to consume anything they wanted from the embassy 's commissary , such as candy , sodas , and souvenirs ( most had been stationed on ships for several months ) . They were also allowed to use or take anything they needed from the embassy ; the medic filled several bags with medical supplies to return to the ship . 
- As evening approached , work began to prepare the HLZ for the main evacuation . The area was used as a parking lot and several vehicles were left without keys by staff that had already been evacuated . Some cars had to be broken into to be moved . Chemical lights were placed in the HLZ in a NATO " Y " pattern . The entire mission would be conducted with night vision goggles , which required all lights in the embassy compound to be turned off . 
- 
- = = = Main evacuation = = = 
- 
- The main evacuation occurred in the early morning hours of 6 January and consisted of four waves of five CH @-@ 46 helicopters . The timing of this phase was determined by the range of the CH @-@ 46 Sea Knight , which lacks aerial refueling capability ; the ships were about 350 – 380 nautical miles ( 650 – 700 km ; 400 – 440 mi ) away during this phase . An AC @-@ 130 was sent from Saudi Arabia to provide gunfire support during the evacuation and two UH @-@ 1 Iroquois helicopters were on standby to provide gunfire support , but were not deployed . 
- The first wave departed Guam at 23 : 43 . As the second wave landed , Major Siad arrived at the embassy gate accompanied by two truckloads of soldiers and held a grenade in one hand and a radio in the other . His request to speak with the ambassador was granted . Major Siad demanded that the evacuation cease immediately because the Somali government had not granted the US permission to carry out such a military operation . He claimed that he would radio soldiers to shoot down the helicopters if the operation continued . The second and third waves were able to depart without incident as the ambassador negotiated with the Major , who finally agreed to settle the matter for several thousand dollars in cash and keys to the ambassador 's armored car . Ambassador Bishop remained engaged in conversation with the Major until he reached the helicopter landing zone to depart with the final wave to prevent the Major from reneging on the deal . The final wave departed the embassy at 1 : 49 and landed on Guam at 2 : 23 ; twenty minutes later , Ambassador Bishop declared the evacuation complete . 
- 
- = = = Aftermath at the embassy = = = 
- 
- Armed looters were observed entering the embassy compound as the final wave departed . The doors of the chancery — the main building of the embassy — were reportedly blown open by RPGs within two hours of the embassy 's evacuation . Somali employees of the embassy — known as foreign service nationals ( FSNs ) — could not be evacuated . Ambassador Bishop tried unsuccessfully to have these employees airlifted to safer parts of Somalia . Many of the FSNs had sought refuge in the embassy with their families and about 30 were hired as guards and protected the embassy throughout the ordeal . Local banks had been closed for some time and the embassy was unable to pay the FSNs . The Ambassador left the FSNs with keys to the commissary and warehouse on the embassy compound and they were permitted to take anything they needed . 
- 
- = = = Return to Oman = = = 
- 
- A total of 281 evacuees were taken from the embassy , including 12 heads of missions ( eight ambassadors and four chargés d 'affaires ) and 61 Americans ( including Ambassador Bishop and 36 embassy staff ) . The heads of mission were the ambassadors of the United States , Kenya , Nigeria , Soviet Union , Sudan , Turkey , United Arab Emirates , and United Kingdom and the chargés of the embassies of Germany , Kuwait , Oman , and Qatar . 
- Rather than disembark in nearby Mombasa , as originally thought by the evacuees , the ships were ordered back to Oman — a five @-@ day journey . The sailors and marines made way for the evacuees to share living quarters . When the chaplain of Guam asked crew to sign up as guides for the evacuees while aboard the vessel , two hundred signed up within an hour , and some of the sailors even dressed up as clowns to ease the ordeal for children . At the request of the ambassadors , a formal session with the ships ' senior officers was held to express their thanks . On 11 January , the evacuees were offloaded at Muscat , Oman . That afternoon , the American evacuees were flown to Frankfurt , Germany , from where they continued home . 
- 
- 
- = 2010 Claxton Shield = 
- 
- The 2010 Claxton Shield was the 57th Claxton Shield tournament , the premier baseball competition in Australia , and was held from 6 November 2009 to 7 February 2010 . It was hailed as the precursor to the new Australian Baseball League that will start in the place of the Claxton Shield in late 2010 to early 2011 . The Victoria Aces defeated South Australia two games to nil in the championship series to win the tournament ; this was the 22nd time the Claxton Shield had been awarded to a Victorian team . The competition was sponsored by Domino 's Pizza . 
- At the conclusion of the regular season , the Victoria Aces finished in first place with a 17 – 7 record , earning home @-@ field advantage for the three @-@ game championship series . South Australia hosted the three @-@ game semi @-@ final series against the New South Wales Patriots . Both teams finished with a 14 – 10 record . The Perth Heat ( 12 – 12 ) and Queensland Rams ( 3 – 21 ) both failed to qualify for the finals . 
- 
- = = Overview = = 
- 
- In June 2009 , it was announced that the rights to the Claxton Shield had been sold to a new Australian Baseball League ( ABL ) , with ownership split between Major League Baseball 's 75 percent share and the 25 percent share owned by the Australian Baseball Federation . The 2010 tournament was considered preparation for the inaugural ABL season starting in 2010 – 11 . It varied from the 2009 Claxton Shield by expanding the season to include ten rounds . Since an uneven number ( five ) of teams were involved , four teams paired off for each round and played a three @-@ game series , while the remaining team took a bye . During the season , each team had two bye rounds and played two rounds against each other team , one at home and one away . In total , the schedule allowed for 24 regular @-@ season games per team before a postseason similar to the 2009 edition : the first @-@ place team directly qualified for the championship series and played against the winner of a playoff series between the second- and third @-@ place teams . 
- During the regular season , games were played on a Friday night and a doubleheader on Saturday ; in each doubleheader one of the two games was shortened to seven innings . The exception to this was when Perth played their home games ; they played on a Thursday night instead of a doubleheader on Saturday . Each postseason series was scheduled for a Friday , Saturday and Sunday . 
- 
- = = Teams = = 
- 
- 
- = = = Rosters = = = 
- 
- The 2010 series allowed each team to make use of a 19 @-@ man active roster . Exceptions were made in two cases that allowed teams ' active rosters to expand to 21 players , both times for the same reason . Two games during the season had to be postponed because of poor weather . Both games involved teams meeting for the first time during the season ; make @-@ up games were scheduled at the start of the return series between the teams , and this resulted in two four @-@ game series . In both cases , the teams had a 19 @-@ man roster for the make @-@ up game , and an expanded 21 @-@ man roster for the originally scheduled series . 
- 
- = = = Venues = = = 
- 
- The 2010 Claxton Shield was contested between five teams from around Australia . In previous years , many of the teams had played their home games at multiple venues . This season each team held their home games at only one venue . There was one scheduled exception to this at the start of the season : the New South Wales Patriots ' final home series against the Perth Heat was held at Gilchrist Oval , whereas all of their other home games were held at Blacktown Baseball Stadium . 
- As a result of poor attendance at Geelong Baseball Park , game one of the fifth @-@ round series between New South Wales and the Victoria Aces was moved to La Trobe University , Melbourne . Although the Geelong games had attracted crowds of no more than 500 , the moved game had an attendance of 2 @,@ 200 . Though no further regular season games were moved , the finals series hosted by the Aces was held at La Trobe University as well . 
- The venues are as follows : 
- 
- = = Regular season = = 
- 
- † — A game postponed from Round 7 , held in Round 8 , was played with Victoria Aces as the away team and Queensland Rams as the home team , despite being played at Geelong Baseball Park , Geelong , Victoria . 
- ‡ — A game postponed from Round 3 , held in Round 9 , was played with South Australia as the away team and Victoria Aces as the home team , despite being played at Norwood Oval , Adelaide , South Australia . 
- The Queensland Rams were the first team to be eliminated from contention for the finals , after being swept four games to nil by the Victoria Aces in round 8 . The following round saw South Australia clinch a position in the finals , despite finishing the round in second position . It was not until the final round that the last two spots in the finals were decided : the Aces clinched top spot by sweeping the Perth Heat , which combined with the New South Wales Patriots sweep of the Rams eliminated Perth from contention and secured the last finals spot for the Patriots . 
- 
- = = = Statistical leaders = = = 
- 
- 
- = = Finals series = = 
- 
- The 2010 Claxton Shield made use of the same finals structure as had been used in the 2009 season . The top three teams at the conclusion of the ten rounds of regular @-@ season games qualified . The second- and third @-@ place teams faced each other in a best @-@ of @-@ three series hosted by the second @-@ place team . The winner of that series then faced the first @-@ place team for a best @-@ of @-@ three series . South Australia hosted the New South Wales Patriots at Norwood Oval , Adelaide , while the Victoria Aces hosted the championship series at La Trobe University , Melbourne . In the finals , the home team and away team alternated during each of the series . As a result , South Australia was officially the away team for game two of its series against New South Wales , as was Victoria in the championship series . 
- After defeating the Patriots two games to one in the semi @-@ final series , South Australia progressed to the championship series against the Aces . There they were defeated two games to nil . After game two of the championship series , Victoria 's Matthew Blackmore was named both Claxton Shield Final Series MVP and Pitcher of the Year . 
- 
- = = = Semi @-@ final series = = = 
- 
- 
- = = = Championship series = = = 
- 
- 
- = = Awards = = 
- 
- At the conclusion of the finals series , the winners of two awards were announced . Matthew Blackmore won both the Pitcher of the Year award and the Finals Series MVP award . At the Baseball Australia Diamond Awards , held on 6 March at the Hotel Grand Chancellor , Adelaide , Wayne Lundgren was announced as the 35th winner of the Helms Award , the Claxton Shield 's Most Valuable Player award . Lundgren was the first pitcher to win since 1986 . Runners @-@ up by two votes were Paul Mildren and Michael Collins . 
- 
- 
- = Independiente ( Ricardo Arjona album ) = 
- 
- Independiente is the thirteenth Spanish @-@ language studio album by Guatemalan singer @-@ songwriter Ricardo Arjona , released on 23 September 2011 . Recorded in the United States and Mexico , it was produced by Arjona with Dan Warner , Lee Levin and Puerto Rican singer @-@ songwriter Tommy Torres . The album — the first independent release by Arjona after he was signed by Sony Music in 1993 and Warner Music in 2008 — was issued by his own label , Metamorfosis . 
- Composed and written in a year , the record marks Arjona and Torres ' fourth collaboration . For Independiente , Arjona returns to his trademark sound after his stylistic departure for Poquita Ropa ( 2010 ) . While producing the latter , he had used fewer instruments to simplify his sound , having introduced what had been called a " stripped @-@ down acoustic effort " in his music . Independiente has been compared to his earlier recordings , Historias ( 1994 ) and Animal Nocturno ( 1993 ) . 
- Independiente became Arjona 's fourth number @-@ one album on the Billboard Top Latin Albums where it debuted for the week ending 22 October 2011 . For thirteen non @-@ consecutive weeks it topped the Latin Pop Albums chart , and reached number one on the Mexican Albums Chart . It is his fifth consecutive album to chart on the Billboard 200 ( reaching number sixty @-@ five ) , and his fourth album to chart in Spain ( peaking at number sixty @-@ eight ) . Within one week after its release Independiente was certified gold in Chile , the United States and Mexico and certified platinum in Venezuela and Argentina . 
- Five singles have been released from the album . The lead single , " El Amor " , became a commercial success in several Latin American countries and was number one on the Billboard Latin Songs and Latin Pop Songs charts . It was followed by " Fuiste Tú " ( featuring Gaby Moreno ) , which reached number one on the Latin Pop Songs , number two on the Latin Songs charts and topped several other national charts . " Mi Novia Se Me Está Poniendo Vieja " was released in May 2012 ; " Te Quiero " in July 2012 , and " Si Tu No Existieras " in November 2012 . To promote Independiente , Arjona embarked on his Metamorfosis World Tour . 
- 
- = = Background = = 
- 
- In 2010 , Arjona wanted to change his musical style ; after experimenting with using as few instruments as possible , he obtained a sound similar to an a capella performance ( simplifying his sound ) and introduced what he called a " stripped @-@ down acoustic effort " to his music . This was heard on his twelfth studio album , Poquita Ropa . Arjona produced the album with Dan Warner , who has worked with Shakira , Celine Dion and Christina Aguilera . When promoting the album Arjona said , " [ songs ] are like women ; they get things up and are so concerned about this that they forget that the less clothes , more beauty . The songs are often overwhelmed by ourselves , because we saturate them with arrangements looking to exalt their qualities and we end up hiding them " . Poquita Ropa became the first album since Adentro which Arjona recorded without Torres . 
- Weeks before the release of Independiente , Arjona issued a letter raising the issue of his past relationships with recording companies . He revealed the circumstances of his first contract : " a producer , friend of mine , told them [ the record label ] that if they did not sign me in , they won 't sign two artists he had [ at that time ] " . Arjona further explained that he received the " minimum royalty percentage " from his most successful albums . Independiente is Arjona 's first independent release through his own label : Metamorfosis , a company he created to refocus his career . The company is presided over by Arjona and several friends ( including photographer @-@ director Ricardo Calderón , Universal Music México executive Humberto Calderon and BMG 's Miriam Sommerz ) , and is based in Miami and Mexico City . Arjona commented that his independence represented compromise more than freedom , stating that " Inside the word ' Independent ' , even when it sounds like extreme freedom , there 's a big amount of compromise and the responsibility of being able to administrate , in the best way possible , such independence " . Billboard notes that , although other groups have released independent albums following contracts with major labels , Arjona is the most important Latin pop artist to do so . Although the album is marketed within the new label , distribution was handled by Warner Music . 
- 
- = = Production and recording = = 
- 
- Independiente marked Arjona 's fourth collaboration with Torres . The latter was a composer and producer , also receiving background @-@ vocal credit . The musicians first worked together in 2005 , when Arjona released his tenth studio album ( Adentro ) . He stated that he first " tested " Torres by sending him the " hookiest and darkest tracks " on the album : " Acompañame A Estar Solo " and " Iluso " . Torres then " went all out on the first demo , hiring a full band that included a string orchestra " . In Quién Dijo Ayer ( 2007 ) , Torres produced the singles " Quién " and " Quiero " and provided background vocals on the remastered versions of Arjona 's past hits . In 5to Piso ( 2008 ) , Torres produced several tracks ; one was the lead single " Como Duele " , considered Arjona 's " biggest hit in years " by Jason Birchmeier of Allmusic . 
- The album was composed over a one @-@ year period . Most of its production was handled by three producers familiar with Arjona 's work : Dan Warner , Lee Levin and Dan Rudin . Tommy Torres also produced three tracks : the lead single " El Amor " , second single " Fuiste Tú " and " Hay Amores " . Victor Patrón produced two songs , ( " Caudillo " and the piano version of " Mi Novia Se Me Está Poniendo Vieja " ) and Julio Chávez aided in the production of " Reconciliación " . Arjona wrote all the songs except " El Amor " ( which was co @-@ written with Torres ) . The album was recorded and produced in several cities in the United States and Mexico . Independiente was mixed at the Blue Grotto in Nashville , Tennessee , and mastered by Tom Coyne and Aya Merrill at Sterling Sound in New York City . With Torres ' return to producing Arjona regained the classic , trademark sound which Torres helped develop since 2005 . 
- 
- = = Composition = = 
- 
- Independiente opens with " Lo Que Está Bien Está Mal " , a Latin pop song and the only track composed by Dan Warner instead of Arjona ( who wrote the lyrics ) . " El Amor " was motivated by Arjona 's desire to examine " those big , dark events within love that nobody talks about " ; he continued , " [ the ] dark sides of love are extremely fundamental to understand its great value . " Arjona added , " So many good things about love has been shown that somebody had to turn it around and tell the bad ones " . In a February 2012 interview , Arjona stated that " El Amor " was the " most tawdry " song he had released to date , explaining that their choice of the song was a " contradiction " because it was not " the song which could better represent the entire album " . He described it as " very strong " and " a bit dark " . The single marked Arjona 's return to his signature , mainstream sound after the Cuban music influenced Poquita Ropa 's lead single " Puente " , a mixture of salsa and merengue which failed to make an impact in the United States . 
- The album includes " Fuiste Tú " , a duet with Guatemalan singer Gaby Moreno . Its instrumentation consists of piano , violin , guitars , drums and other percussion . Although Arjona stated that he " had the possibilities to record this song with very well known people " he expressed his happiness with Moreno , revealing that " the possibilities of doing it with her , for me , are a celebration " . He described Moreno as " incredibly talented " , a " countrywoman " and a " fantastic human being " . Arjona named " Fuiste Tú " as one of the most important songs on the album . " Mi Novia Se Me Está Poniendo Vieja " took two years to complete and Arjona dedicated it to his mother , Noemí Morales . He stated that he wrote it " as a gift for my mom on Mother 's Day " and that he thought " the idea of including it on the album was very good " . As with his single " Señora De Las Cuatro Decadas " ( on 1994 's Historias ) , at first he never thought to include the song on an album . " Caudillo " evokes " the image of some friends " Arjona had at college ; he asserted that he " appears constantly there because sometimes we transform ourselves into a contradiction of all those things we fought in those moments . It 's the history of a student leader that becomes a president " . Arjona dedicated the album to his father , who died in 2011 . 
- 
- = = Release and promotion = = 
- 
- Independiente was first digitally released in some South American countries on 23 September 2011 as a special edition , dubbed the Cono Sur Edition . This version included a different mix of " Reconciliación " . On 30 September , the digital download for the standard edition of the album was released in several Latin American and European countries . On 4 October , the album was officially released as a digital download and compact disc in most of these same markets as well as North America ; an iTunes edition was released as a digital download on the iTunes music store . This version included an album @-@ only video , entitled " Independiente " . In Germany , the album was first available on the Kiwi label on 4 October and on 11 October through Warner Music . In Canada and Spain , the compact @-@ disc version of the album was available on 25 October . 
- Arjona appeared on a television special in 2011 to promote Independiente . The special featured guest appearances by Gaby Moreno , Ricky Muñoz ( of the Mexican band Intocable ) and Paquita la del Barrio . Broadcast by Televisa , the program showcased the fourteen songs on Independiente . Muñoz said that he was " happy to do things for Ricardo [ Arjona ] " , elaborating that they met each other " some time ago " and it was " a very special situation " . The show was later rebroadcast on 5 November by Canal de las Estrellas . 
- 
- = = Singles = = 
- 
- The first single from Independiente is " El Amor " , released on 23 August 2011 . In the United States it reached number one on the Billboard Top Latin Songs chart ( Arjona 's fourth number one on that chart , following " Desnuda " , " Cuando " and " El Problema " ) and number one on the Billboard Latin Pop Songs chart . It was also a hit in Latin America , reaching number one in Argentina , Mexico , Colombia , Venezuela , Chile , Costa Rica , Panama and Guatemala . The music video for " El Amor " , filmed in black @-@ and @-@ white , was released on 8 September 2011 . It was directed by Ricardo Calderón ( who also directed Arjona 's music video for " Como Duele " ) and filmed in Mexico City . The second single from the album is " Fuiste Tú " , a duet with Guatemalan singer Gaby Moreno . The music video for the song was shot in Guatemala ( around the tropical areas of Antigua Guatemala , Río Dulce , the Atitlán lake , Semuc Champey and the Tikal ruins ) and directed by Argentine director Joaquín Cambre . Arjona commented that " this video recreates the battle on the couple when someone starts to talk ' is the beginning of the end ' " . " Fuiste Tú " reached number two on the Billboard Top Latin Songs and number one on the Latin Pop Songs charts . 
- " Mi Novia Se Me Está Poniendo Vieja " was released as the third single . Arjona wrote the song for his mother , Noemí Morales . The music video for the song , released in April 2012 , was filmed at Universal Studios in Los Angeles . It features Arjona and his son , Ricardo Arjona Jr . , and was directed by Robert García . The song was used by American telecommunications corporation AT & T for a Nokia Lumia 900 smartphone commercial featuring Arjona and was released in music stores in May 2012 . The fourth single from the album , " Te Quiero " , was released in July 2012 . The music video for the song was filmed during Arjona 's concerts at Vélez Stadium in Buenos Aires , Argentina during his Metamorfosis World Tour . It marks Arjona 's first music video taken from a live performance . The song reached number ten in Mexico and number one on both the Billboard Latin Songs and Latin Pop Songs chart . " Si Tu No Existieras " was released in November 2012 as the set 's fifth and final single , and was intended to promote the re @-@ release of the album . The song , whose music video was similar to that of " Te Quiero " , managed to peak at number 14 in Mexico . 
- 
- = = Tour = = 
- 
- Beginning on 27 January 2012 in Toluca , Mexico , Arjona embarked on a world tour to promote the album . The Metamorfosis World Tour was announced in December 2011 , and visited the Americas . The show consisted of four theatrical sets on a revolving stage ; Arjona performed on each in turn , as it related to each song . Fellow Guatemalan singer @-@ songwriter Gaby Moreno appeared in several performances , joining Arjona for " Fuiste Tú " . The tour was praised by critics and fans . Natalie Torres of Dia a Dia reported , " Arjona knows how to handle his ' girls ' , with a mix of attitudes from a ' rough ' male and seductive lyrics " . 
- Jon Pareles of The New York Times commented that " Arjona is one of Latin pop ’ s finest lyricists : observant , nuanced , sometimes wry , sometimes melancholy and especially fond of the play of opposites " . He added , " unlike some of his fellow Latin pop stars , Mr. Arjona is no saccharine lover boy " . The tour broke records for ticket sales , commercial gross and attendance . In Buenos Aires it was the most popular show at Velez Stadium , with a total attendance of more than 160 @,@ 000 for four consecutive sold @-@ out concerts . In Guatemala City Arjona was the first artist with two consecutive sold @-@ out concerts at Mateo Flores Stadium , with a combined attendance of more than 50 @,@ 000 . As of October 2012 , the tour has been performed for close to one million people in more than eight countries . 
- 
- = = Commercial performance = = 
- 
- Independiente debuted at the top of the Billboard Top Latin Albums for the week ending 22 October 2011 , and remained at that position the following week . It was the third album by Arjona to remain for more than a week at number one , after Galería Caribe ( 2000 ) and 5to Piso . Independiente became his fourth chart @-@ topper , following Poquita Ropa ( 2010 ) . For its third week it fell to number two , replaced by Chino & Nacho 's Supremo . The album also debuted at number one on the Latin Pop Albums chart for the week ending 22 October , becoming Arjona 's fifth album to do so . It remained at number one the following week ; for its third week , it was replaced by Supremo . The album reached number one again for the week ending 12 November , and later for the week ending 11 February 2012 . For its second run it remained three weeks at the top before being replaced by Maná 's Drama y Luz for a week ; for its third run at number one , it remained at the top spot for five weeks . For the week ending 2 June 2012 , Independiente returned again to number one . 
- For the week it debuted atop both the Latin Albums and Latin Pop Albums charts , Independiente also appeared as number 65 on the Billboard 200 . It is Arjona 's fifth consecutive album to chart on that list ( following Adentro , Quién Dijo Ayer , 5to Piso and Poquita Ropa ) , although it has only charted higher than Adentro . In Mexico , Independiente debuted at number one for the week ending 9 October 2011 . The following week it fell to number two , replaced at the top by Espinoza Paz ' Canciones Que Duelen . For its third week , the album fell to number three . In Argentina , Independiente debuted at number one for the week ending 9 October 2011 ; it remained at the top position for a single week , dropping to number five the following week . The album also charted in Spain , reaching number 76 . The following week it fell off the chart but later re @-@ entered , reaching its peak at number 68 . Independiente is Arjona 's fourth album to chart in Spain , following Adentro , 5to Piso and Poquita Ropa . On the 2011 year @-@ end charts , Independiente was the 50th best @-@ selling album on the Latin Albums chart and the 15th best @-@ seller on the Latin Pop Albums chart . In Mexico , it was the 19th best @-@ selling album of 2011 . 
- Independiente was certified platinum by the Argentine Chamber of Phonograms and Videograms Producers in recognition of 40 @,@ 000 copies sold . It was also certified gold and platinum by the Mexican Association of Producers of Phonograms and Videograms for 90 @,@ 000 copies shipped . In the United States , Independiente was certified Latin platinum by the Recording Industry Association of America for 100 @,@ 000 copies shipped . In Venezuela , the album was certified double platinum for more than 40 @,@ 000 copies sold . It was certified gold in Chile for 5 @,@ 000 copies shipped , and in Colombia for 10 @,@ 000 copies sold . As of November 2012 , Independiente has sold 75 @,@ 000 copies in the United States . 
- 
- = = Critical reaction and awards = = 
- 
- David Jeffries of Allmusic gave the album a mildly positive review , citing Arjona 's return to his more mainstream style after the " stripped @-@ down acoustic effort " Poquita Ropa . He compared ( as did Arjona and other critics ) the production values and musical style of Independiente with past albums Animal Nocturno and Historias . Finally , he stated that " Returning fans will revel in this combination of freedom and growth , and appreciate the return of producer Tommy Torres , the man who has been behind the boards for quite a few of Arjona 's most popular releases " ( referring to Torres ' absence from the production of Poquita Ropa ) . 
- A contributor to the Colombian website CMI commented that " listening to Independiente is a labyrinth to go through , each song is a huge path that seems to have no end , because it involves imagination , it invites you to dream , to charm , to bewitch . But neither leaves behind the problematic requirements of love , its loopholes , hideouts and concerns , as well as its bad times in this joke that 's life " . Independiente was nominated at the Premios Juventud of 2012 for the " Lo Toco Todo ( I Play It All ) " award . On 25 September 2012 , the album received two nominations at the 13th Annual Latin Grammy Awards : Album of the Year and Best Singer @-@ songwriter album . On 3 December 2012 , Independiente received a nomination for " Pop Album of the Year " at the 2013 Premios Lo Nuestro awards . It also received a nomination for Grammy Award for Best Latin Pop Album at the Grammy Awards of 2013 . Arjona has not won the latter award since 2005 with Adentro in a shared win with Mexican singer Julieta Venegas . In February 2013 , Independiente received a nomination for " Latin Pop Album of the Year " at the Billboard Latin Music Awards of 2013 . 
- 
- = = Track listing = = 
- 
- All songs written and composed by Ricardo Arjona , except where noted . 
- 
- = = Personnel = = 
- 
- Credits are taken from Independiente liner notes . 
- 
- = = Chart performance = = 
- 
- 
- = = Certifications = = 
- 
- 
- = = Release history = = 
- 
- 
- 
- = 2003 Pacific typhoon season = 
- 
- The 2003 Pacific typhoon season was a slightly below average yearlong period of tropical cyclogenesis exhibiting the development of 31 tropical depressions , of which 21 became named storms ; of those , 14 became typhoons . Though every month with the exception of February and March featured tropical activity , most storms developed from May through October . During the season , tropical cyclones affected the Philippines , Japan , China , the Korean Peninsula , Indochina , and various islands in the western Pacific . 
- The season ran year @-@ round , with the first storm , Yanyan , developing west of the Marshall Islands on January 15 . In April , Typhoon Kujira became one of the longest @-@ lasting Pacific typhoons in history and attained climatological records for its unusually early impacts . Typhoon Imbudo in July caused several deaths and extensive damage across the Philippines and China . In September , Typhoon Maemi became one of the costliest typhoons in recorded history after striking South Korea ; Maemi was also the most intense tropical cyclone of the season with a minimum barometric pressure of 910 mbar ( hPa ; 26 @.@ 87 inHg ) . In late November , Typhoon Lupit devastated areas of Yap State in the Federated States of Micronesia . The season closed with the dissipation of a tropical depression east of the Philippines on December 27 . 
- The scope of this article is limited to the Pacific Ocean , north of the equator and west of the International Date Line . Storms that form east of the date line and north of the equator are called hurricanes ; see 2003 Pacific hurricane season . Tropical Storms formed in the entire west Pacific basin are assigned a name by the Tokyo Typhoon Center . Tropical depressions in this basin monitored by the Joint Typhoon Warning Center ( JTWC ) have the " W " suffix added to their number . Tropical depressions that enter or form in the Philippine area of responsibility are assigned a name by the Philippine Atmospheric , Geophysical and Astronomical Services Administration or PAGASA . This can often result in the same storm having two names . 
- 
- = = Seasonal forecasts = = 
- 
- On March 5 , 2003 , meteorologists from the University College London at the Tropical Storm Risk ( TSR ) Consortium issued an extended range forecast for the typhoon season , noting the likelihood of near average tropical cyclone activity as a result of projected neutral sea surface temperatures . The forecast indicated the potential for 26 @.@ 2 tropical storms , compared to the 10 – and 30 @-@ year average of 27 @.@ 8 and 26 @.@ 3 storms , respectively . The following month , the group raised their forecast for tropical storms to 26 @.@ 7 , indicating a slightly above average season . Over the next two months , however , fluctuations in sea surface temperatures , particularly those in the Central Pacific , caused the group to revise their predictions downward and indicated the probability for a slightly below average typhoon season in their June forecast . A rise in sea surface temperatures in the following months prompted the forecasting group to once again raise their forecasts to indicate a near @-@ average season in their final August forecast update , which predicted 27 tropical storms . The group was very accurate in their forecasts , with their April and August forecasts being the most accurate . 
- Similarly , meteorologists working with the City University of Hong Kong issued a seasonal projection on April 24 , 2003 , indicating the likelihood of a normal or below normal season with 29 total tropical cyclones , 26 tropical storms , and 16 typhoons . As with the TSR , the group primarily based their forecast numbers on the prevailing status of the El Niño @-@ Southern Oscillation . The City University of Hong Kong revised their forecasts on June 24 , 2003 , indicating a slight increase of total tropical cyclones to 30 . The group was also accurate in their forecasts for the entirety of the Northwest Pacific , though their specialized forecasts for the South China Sea were substantially off . 
- During the year , the Japan Meteorological Agency ( JMA ) issued advisories on tropical cyclones west of the International Date Line to the Malay Peninsula , and north of the equator ; this was due to the agency 's status as the official Regional Specialized Meteorological Center , as designated by the World Meteorological Organization in 1989 . The JMA issued forecasts and analyses four times a day , beginning at 0000 UTC and continuing every six hours . The JMA issued forecasts based on a climatological tropical cyclone forecast model . The agency estimated 10 minute sustained winds and barometric pressure based on the Dvorak technique and numerical weather prediction . The JTWC also issued warnings on storms within the basin , operating from Pearl Harbor in Hawaii to represent the interests of the United States Armed Forces in the Indian and Pacific Oceans . 
- 
- = = Season summary = = 
- 
- Throughout the season , sea surface temperatures within the western equatorial Pacific were above normal , including those in the South China Sea . Areas of convection persisted year @-@ round in the lower latitudes , particularly around the Philippines . Atmospheric divergence was also prevalent in the same regions , resulting in enhanced tropical cyclogenesis east of the Philippines in 2003 ; the mean region of development of tropical systems during the year was more southwest than the 1971 – 2000 30 @-@ year average . In 2003 , the JMA monitored 21 tropical cyclones of at least tropical storm intensity ; of those , 14 reached typhoon intensity . Though the number of tropical storms was below average , the ratio between tropical storms and typhoons was 66 % greater than normal . The Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) monitored three additional cyclones of at least tropical storm intensity that were not monitored by the JMA . 
- The season began with the formation of Tropical Storm Yanyan on January 15 . After its dissipation five days later , no tropical cyclones of at least tropical storm intensity developed over the next two months . This period of inactivity ended with the formation of Typhoon Kujira in mid @-@ April ; Kujira was one of the longest lived Pacific storms on record and was the first typhoon with 1 @-@ minute sustained winds of at least 240 km / h ( 150 mph ) in April since Typhoon Isa in 1997 . Tropical activity was enhanced from May to June , and during this period the JMA monitored four tropical storms , while the PAGASA monitored a fifth storm off the eastern Philippines . Three of the four tropical storms monitored by the JMA approached or hit Japan , including Typhoon Soudelor , which brought heavy rainfall and wind across the Ryukyu Islands and the Korean Peninsula . 
- Tropical activity once again declined towards the second half of June and first half of July . The second half of July , however , featured the development of typhoons Imbudo and Koni , which both tracked westward across the Philippines before striking areas near Saipan and other regions of southeastern China . Imbudo caused the deaths of 78 people and US $ 383 million in damage . August was a highly active month for tropical cyclogenesis , with a total of six tropical storms monitored by the JMA , JTWC , and PAGASA . This included typhoons Krovanh and Dujuan , which also struck southeastern China . Typhoon Etau earlier in the month made landfall in Japan , resulting in 17 deaths . 
- Activity was somewhat below average in September , with only one tropical cyclone making landfall , Maemi . However , Maemi was the strongest tropical cyclone of the season and was the costliest with roughly US $ 4 @.@ 8 billion in damage , mostly in South Korea . Tropical cyclogenesis and activity continued to decline after August , with October featuring only three tropical storms . However , two , Ketsana and Parma , reached typhoon intensity ; both stayed away from land . November featured fewer storms but was climatologically average , with two typhoons developing . The second typhoon , Lupit , devastated portions of Yap State , resulting in approximately $ 1 @.@ 7 million in damage . In December , the JTWC and PAGASA monitored a sole tropical system east of the Philippines , though the JMA did not monitor or classify any tropical cyclones during the month . 
- 
- = = Storms = = 
- 
- In storm information below , wind @-@ speed advisories differ from the Joint Typhoon Warning Center ( JTWC ) to the JMA as the JTWC uses the United States criteria of 1 @-@ minute mean to designate maximum sustained winds , while the JMA uses the 10 @-@ minute mean wind criteria to designate tropical cyclone maximum sustained winds . This difference generally results in JTWC maximum winds appearing higher than the maximum winds described by the JMA for the same cyclone . 
- 
- = = = Tropical Storm Yanyan = = = 
- 
- On January 11 , the JTWC began monitoring the disturbance that would eventually develop into Yanyan near the International Date Line . As the system tracked westward , it gradually moved into a more favorable environment for tropical cyclogenesis . On January 14 , surface observations indicated that the low @-@ pressure area had developed a closed , low @-@ level circulation center indicative of a tropical cyclone , though satellite imagery remained inconclusive . Nonetheless , the JMA classified the pressure area as a tropical depression west of the Marshall Islands at 0600 UTC on January 15 . The JTWC would follow suit by classifying the storm as such at 1800 UTC later that day . At the time , the depression was tracking west @-@ northwest under the influence of a subtropical ridge to the north . Over the next day the system waned in convective activity before resuming its previous track and accelerating . At 0000 UTC on January 17 , the JTWC upgraded the system to tropical storm intensity , though the storm remained nameless as the JMA continued to classify it as a tropical depression . 
- Throughout January 17 the tropical storm would again oscillate in strength , resulting in a brief downgrade by the JTWC to tropical depression intensity . However , an increase in deep convection resulted in its reclassification as a tropical storm at 1800 UTC that day , followed by the JMA upgrading the system to tropical storm intensity at 1200 UTC on January 18 . As such , the storm received the name Yanyan . At roughly the same time , the tropical cyclone began to stall east of the Mariana Islands and curve sharply northeastward . Whilst the JTWC indicated that Yanyan peaked in strength late on January 18 with 1 @-@ minute sustained winds of 60 km / h ( 37 mph ) , the JMA considered the system to have maintained the same intensity throughout its stint as a tropical storm . Steered by the same nearby subtropical ridge , Yanyan would continue to track towards the northeast into a less favorable tropical cyclone environment . The JMA downgraded Yanyan to tropical depression at 1200 UTC on January 20 before the storm transitioned into an extratropical cyclone as its low @-@ level circulation center decoupled from the primary mass of convection due to strong wind shear . At 0000 UTC , both the JTWC and JMA discontinued the monitoring of Yanyan . 
- 
- = = = Typhoon Kujira ( Amang ) = = = 
- 
- Kujira developed from a broad area of disturbed weather as a tropical depression on April 9 well removed from any landmasses . Shortly after development , Kujira quickly intensified in its early stages , and was upgraded to a tropical storm just two days after cyclogenesis . Strengthening slowed afterwards , though the storm attained typhoon intensity on April 14 . Intensification continued and late on April 15 , Kujira reached its peak intensity with winds of 165 km / h ( 105 mph ) and a minimum barometric pressure of 930 mbar ( hPa ; 27 @.@ 46 inHg ) . Following peak intensity , Kujira would begin to track northwest and oscillate in strength , cresting an additional two times in intensity . On April 21 , the typhoon was downgraded to tropical storm intensity and began to track erratically for several days east of Taiwan . However , on April 24 , Kujira would resume a northward track and begin to weaken , and on April 24 was downgraded to tropical depression strength as it made landfall on Kyushu . Following landfall , Kujira transitioned into an extratropical cyclone on April 25 , which persisted until crossing the International Dateline towards the end of April 2003 . 
- Shortly after developing , Kujira caused two fatalities in Pohnpei in addition to minor agricultural and infrastructural damage ; similar effects were felt in Guam . Several days later , the typhoon prompted cyclone warnings and other precautionary measures in the Philippines after forecasts indicated the potential for strong winds and rain . However , ultimately any effects in the archipelago associated with Kujira remained minimal . The typhoon also prompted warning products in Taiwan , making it the first April typhoon since 1978 to cause such a feat . Unlike in the Philippines , however , Kujira would bring significant rainfall to Taiwan . Effects from the typhoon were most significant in Japan , particularly in the Ryukyu Islands . Strong winds , rain , and waves caused US $ 230 @,@ 000 ( ¥ 27 @.@ 8 million ) in agricultural damage on Ishigaki Island . One person was killed due to injuries resulting from the waves . In Kyushu , heavy rainfall , peaking at 196 mm ( 7 @.@ 7 in ) in Ōita Prefecture , was reported . Overall , despite its distance away from land and weak intensity at the time of its sole landfall , Kujira resulted in three fatalities . 
- 
- = = = Typhoon Chan @-@ hom = = = 
- 
- Midday on May 18 , the JTWC began to monitor an area of persistent disturbed weather associated with a broad low @-@ pressure area southwest of Chuuk . Within highly conducive conditions , the disturbance quickly organized and became classified as a tropical depression at 0000 UTC the following day . In its initial stages , the depression tracked slowly northeastwards . However , a shortwave trough forced a weakness in a nearby ridge , allowing for the storm to take a more streamlined , northward path . At 1200 UTC on May 20 , the JMA upgraded the depression to Tropical Storm Chan @-@ hom . Following the system 's naming , Chan @-@ hom temporarily meandered towards the northwest before resuming its northeasterly track . The next day , the storm began to develop an eye ; this was reflected with an upgrade by the JMA to typhoon status at 0600 UTC on May 23 . Gradual intensification followed , and at 1800 UTC that day Chan @-@ hom reached its peak intensity with maximum sustained winds of 155 km / h ( 100 mph ) and a minimum pressure of 940 mbar ( hPa ; 27 @.@ 76 inHg ) . 
- Following peak intensity , Chan @-@ hom began to intake dry air beginning on May 25 . At roughly the same time , the typhoon began to weaken and accelerate towards the northeast . Conditions continued to worsen as the storm moved further north , and as the cyclone passed east of Minamitorishima , it was downgraded to tropical storm classification . By this time , Chan @-@ hom had lost much of its convection due to wind shear . Early on May 27 , Chan @-@ hom had fully transitioned into an extratropical cyclone , and these remnants continued to track towards the northeast . These extratropical remnants dissipated south of the Aleutian Islands the following day . Early in the typhoon 's existence , Chan @-@ hom posed a potential threat to Guam , but remained well east of the island . However , after passing to the northeast , winds from the typhoon fanned volcanic ash from the recently erupting Anatahan volcano towards the island , prompting precautionary measures in Guam . Ashfalls were reported on the island , forcing the cancellation of several flights . As a tropical storm , Chan @-@ hom caused some damage to homes and crops on Chuuk , mostly due to heavy rains brought forth by the storm . Offshore , a 1 @,@ 040 ton fishing vessel , the Nien Feioch , sank during the storm . The ship was valued at $ 16 million . 
- 
- = = = Severe Tropical Storm Linfa ( Chedeng ) = = = 
- 
- Tropical Storm Linfa developed as a tropical depression just off the western coast of Luzon on May 25 . The disturbance quickly intensified to reach tropical storm intensity a few hours after cyclogenesis . However , intensification leveled off as Linfa executed a small clockwise loop before a subsequent landfall on Luzon on May 27 . Due to land interaction the storm temporarily weakened and decoupled before reforming in the Philippine Sea . Afterwards Linfa began reintensifying and reached its peak intensity on May 29 with maximum sustained winds of 100 km / h ( 65 mph ) and a barometric pressure of 980 mbar ( hPa ; 28 @.@ 94 inHg ) . Following its peak the tropical storm began to deteriorate and transitioned into an extratropical cyclone on May 30 ; these extratropical remnants continued to track northward through Japan before dissipating in the Sea of Okhotsk on June 4 . 
- The erratic and slow movement of Linfa off the western Philippines was the catalyst for extreme rainfall and flooding , killing 41 persons in the archipelago . Precipitation peaked at 723 mm ( 28 @.@ 5 in ) near Dagupan . Rising floodwaters resulted in the temporary shutdown of government offices and numerous mudslides . In addition , strong winds caused widespread power outages . Overall damage from Linfa in the Philippines amounted to ₱ 192 @.@ 3 million ( US $ 3 @.@ 65 million ) . The floods also displaced 8 @,@ 367 people in 1 @,@ 686 families and destroyed 178 homes . Linfa and its extratropical remnants later brought torrential rainfall and widespread flooding to Japan , particularly in southwestern regions . Rainfall there peaked at 727 mm ( 28 @.@ 62 in ) . Flood damage was worst in Kōchi and Tokushima Prefectures , where several buildings were destroyed by floodwater . Other locations in Japan experienced considerable agricultural damage as well as numerous landslides . Overall , Linfa caused roughly $ 28 @.@ 2 million in damage , much of which occurred in Japan , though the entirety of deaths associated with the cyclone took place in the Philippines . 
- 
- = = = Severe Tropical Storm Nangka ( Dodong ) = = = 
- 
- In late May , an area of disturbed weather began to persist in the South China Sea . The JTWC began to monitor the storm cluster on May 29 . The following day , the JMA reclassified the system as a tropical depression ; initially the system remained highly disorganized due to the lack of deep convection . Persistent moderate wind shear and dry air prevented the cyclone from strengthening significantly in the storm 's early stages . These conditions abated as the depression tracked northeast , and at 0000 UTC on June 1 , the JMA upgraded the system to Tropical Storm Nangka . Throughout the course of the day , Nangka continued to strengthen as it accelerated northeast , and peaked in strength with a barometric pressure of 985 mbar ( hPa ; 29 @.@ 09 inHg ) and maximum sustained winds of 95 km / h ( 60 mph ) , making it a severe tropical storm . 
- However , upon moving through the Bashi Channel , conditions began to deteriorate due to increased wind shear , weakening the system and resulting in its downgrade to tropical depression status by the JMA at 1200 UTC on June 3 . Nangka continued to become increasingly disorganized as it moved further north , and late that day , the depression transitioned to an extratropical cyclone . The resulting remnants continued to track well east of Japan before dissipating on June 7 . Due to its track away from landmasses , damage remained minimal ; however , as Nangka passed to the south and east of Japan , the storm brought light rainfall to the country , peaking at 81 mm ( 3 @.@ 2 in ) in Minamidaitō , Okinawa . 
- 
- = = = Typhoon Soudelor ( Egay ) = = = 
- 
- A tropical disturbance persisted in the monsoon trough northwest of Pohnpei on June 7 , and moved westward without development due to wind shear . On June 11 , the shear decreased enough to allow the convection to organize , and the next day the JMA classified it as a tropical depression northeast of Palau . On June 13 , the JMA upgraded it to Tropical Storm Soudelor to the east of the Philippines , and PAGASA gave it the local name " Egay " . Soudelor moved to the northwest and later to the north , parallel to the eastern Philippines , and on June 17 , the JMA upgraded it to typhoon status . The storm rapidly intensified to the east of Taiwan as it developed a well @-@ defined eye , and while doing so passed over the Japanese island of Iriomote @-@ jima at around 2030 UTC on June 17 . At 0600 UTC on June 18 , the JTWC estimated peak 1 minute winds of 215 km / h ( 135 mph ) , while the JMA estimated peak 10 minute winds of 150 km / h ( 90 mph ) . Increased shear weakened the typhoon to tropical storm strength on June 19 , and later that day the JMA declared the storm as extratropical near the Oki Islands . The extratropical remnants of Soudelor continued to the northeast , crossing northern Japan on June 20 and dissipating on June 24 . 
- While offshore the Philippines , Soudelor dropped heavy rainfall that caused flooding and left thousands homeless . The storm caused ₱ 131 million ( PHP , $ 2 @.@ 46 million USD ) in damage and 12 deaths . On the Japanese island of Iriomote @-@ jima , wind gusts reached 204 km / h ( 127 mph ) . It also affected Taiwan , where floods covered highways and caused mudslides . In Japan , the storm caused widespread power outages , although damage was minimal , and there were 21 injuries . In South Korea , there was $ 12 @.@ 1 million in damage and two deaths . 
- 
- = = = Typhoon Imbudo ( Harurot ) = = = 
- 
- On July 15 , the JMA estimated that a tropical depression formed , and the next day the JTWC initiated advisories on Tropical Depression 09W about 665 km ( 415 mi ) east of Yap . A subtropical ridge near Okinawa steered the nascent depression to the west @-@ northwest for much of its duration . With warm waters and favorable upper @-@ level conditions , the depression quickly organized , first to Tropical Storm Imbudo on July 17 , and to typhoon status two days later , when PAGASA began issuing advisories on Typhoon Harurot . Around that time , Imbudo was rapidly intensifying , developing a well @-@ defined eye . At 1200 UTC on July 20 , the JMA estimated peak 10 minute sustained winds of 165 km / h ( 105 mph ) , and at the same time , the JTWC estimated 1 minute sustained winds of 240 km / h ( 150 mph ) , making it a super typhoon . Imbudo maintained peak winds for about 12 hours , before undergoing an eyewall replacement cycle . At 0300 UTC on July 22 , Imbudo struck northern Luzon , with 1 minute winds estimated at 205 km / h ( 125 mph ) by the JTWC . It weakened over land , but re @-@ intensified in the South China Sea , striking southern China near Yangjiang , Guangdong on July 24 . Imbudo rapidly weakened , dissipating on July 25 . 
- In the Philippines , officials evacuated over 14 @,@ 000 people . Imbudo was the strongest typhoon to strike since Typhoon Zeb five years prior . The typhoon left widespread areas flooded for several days . Damage was heaviest in the Cagayan Valley , where over 80 @,@ 000 people were displaced by the storm . In Isabela , high winds wrecked most of the banana crop and severely damaged other crops . Throughout the Philippines , Imbudo damaged or destroyed 62 @,@ 314 houses , causing P4.7 billion ( 2003 PHP , $ 86 million 2003 USD ) in damage . There were 64 deaths in the country . In southern China in Yangjiang , more than 30 @,@ 000 people evacuated ahead of the storm , and more than half of the trees in the city fell due to strong winds . High winds killed a man in Hong Kong after knocking him off a platform . Throughout Guangdong , Imbudo destroyed 595 @,@ 000 houses and caused eight deaths . Heavy rains spread across southern China , peaking at 343 mm ( 13 @.@ 5 in ) at Hepu County in Guangxi province . There , 12 people died from the storm . Overall damage in China was about ¥ 4 @.@ 45 billion ( CNY , $ 297 million USD ) . 
- 
- = = = Severe Tropical Storm Koni ( Gilas ) = = = 
- 
- Koni originated from a tropical depression situated within the monsoon trough to the east of the Philippines on July 15 . Tracking westward , intensification was slow and the system remained a tropical depression as it moved across the central Philippines on July 17 . Upon moving into the South China Sea , conditions allowed for quicker strengthening , and the cyclone reached tropical storm status on July 18 before reaching its peak intensity with maximum sustained winds of 110 km / h ( 70 mph ) , making it a severe tropical storm . However , atmospheric conditions began to deteriorate as Koni made landfall on Hainan on July 21 , weakening the system . The tropical storm continued to weaken as it moved over the Gulf of Tonkin prior to a final landfall near Hanoi , Vietnam the following day . Tracking inland , the combination of land interaction and wind shear caused Koni to dissipate over Laos on July 23 . 
- Shortly after development , Koni tracked through the Philippines , killing two people . After moving into the South China Sea , turbulence produced by the storm resulted in an aviation incident involving a commercial airliner off the western Philippines . Three of the plane 's occupants received minor injuries . In Hainan , Koni caused heavy rainfall , peaking at 189 mm ( 7 @.@ 44 in ) at a station on Wuzhi Mountain . The rains resulted in the collapse of 1 @,@ 400 homes and an estimated CN ¥ 140 @.@ 27 million ( US $ 16 @.@ 9 million ) in direct economic losses . Effects were worst in Vietnam , where three people were killed . Widespread power outages occurred , and strong winds resulted in agricultural and infrastructural damage , particularly in Vietnam 's northern provinces . 
- 
- = = = Tropical Storm Morakot ( Juaning ) = = = 
- 
- Morakot spawned from an area of disturbed weather in the Philippine Sea on July 31 . Tracking northwest , favorable conditions allowed for the intensification of the system to tropical storm strength on August 2 . Morakot reached peak intensity later that day with winds of 85 km / h ( 50 mph ) and a minimum barometric pressure of 992 mbar ( hPa ; 28 @.@ 29 inHg ) . This intensity was held for several hours until less conducive atmospheric conditions slightly weakened the system ; this was followed by Morakot making landfall on southern Taiwan on August 3 . Subsequently , the storm weakened and moved into the Taiwan Strait before making its final landfall near Quanzhou , China the next day . The storm quickly weakened over the Chinese mainland , and dissipated entirely several hours after landfall . 
- In Taiwan , where Morakot first made landfall , heavy rainfall resulted in flooding . Commercial flights , schools , and rail service in some areas were cancelled in advance of the storm . Precipitation there peaked at 653 mm ( 25 @.@ 71 in ) over a period of nearly two days in Taitung County . Crop damage also resulted from the rainfall , and was estimated at over NT $ 70 million ( US $ 2 million ) . In China , record rainfall was reported . The worst impacted city was Quanzhou , where losses due to Morakot reached CN ¥ 240 million ( US $ 29 million ) and one death was reported . Power outages were also widespread across southeastern China . Due to preexisting drought conditions , 703 cloud seeding operations took place in order to artificially generate added rainfall ; such operations resulted in moderate precipitation over the targeted area . Overall , Morakot caused roughly $ 31 million in damage and three deaths . 
- 
- = = = Typhoon Etau ( Kabayan ) = = = 
- 
- A tropical depression developed on August 2 southeast of Guam , and gradually intensified while moving to the northwest , becoming a tropical storm on August 3 and a typhoon a day later . Etau formed an eye and became a large storm by the time it approached Okinawa on August 7 . The typhoon attained peak winds of 155 km / h ( 100 mph ) before weakening slightly while turning to the northeast . Etau made landfall on the Japanese island of Shikoku on August 8 , and later moved across portions of Honshu and Hokkaido . After weakening to tropical storm status , the cyclone became extratropical on August 9 and dissipated three days later . 
- While passing northeast of the Philippines , the typhoon caused light damage in the archipelago . The eye crossed over Okinawa , where Etau left 166 @,@ 800 people without power and caused 10 injuries . Near where Etau first struck Japan , Muroto reported a peak wind gust of 166 km / h ( 103 mph ) , at the time the third strongest on record there . The typhoon also dropped torrential rainfall peaking at 683 mm ( 26 @.@ 9 in ) . The combination of winds and rainfall caused landslides , particularly on Hokkaido . Nationwide , Etau killed 20 people , destroyed 708 houses , and caused ¥ 35 @.@ 1 billion ( JPY , $ 294 @.@ 8 million USD ) in damage . 
- 
- = = = Typhoon Krovanh ( Niña ) = = = 
- 
- Krovanh originated from a tropical disturbance within the monsoon trough east of Chuuk State on August 13 . Despite rather favorable conditions , the initial tropical depression did not intensify significantly and degenerated into a remnant low on August 18 . However , these remnants were able to reorganize and the system was reclassified as a tropical cyclone a day later . Intensification was rather rapid upon the storm 's reformation – the depression reached tropical storm status on August 20 and then typhoon intensity two days later . Shortly after , Krovanh made landfall on Luzon at peak intensity with winds of 120 km / h ( 75 mph ) . The typhoon emerged into the South China Sea as a much weaker tropical storm , though it was able to restrengthen over warm waters . Once again at typhoon intensity , Krovanh clipped Hainan before moving over the Leizhou Peninsula on its way to a final landfall near Cẩm Phả , Vietnam on August 25 . Quick weakening due to land interaction occurred as Krovanh moved across northern Vietnam , where the storm met its demise the following day . 
- Krovanh first struck the Philippines , resulting in heavy rainfall and displacing approximately 1 @,@ 000 families . The flooding caused severe damage and killed one person . Krovanh 's effects were much more severe in China . In Hong Kong , eleven people were injured and isolated flooding occurred as a result of the typhoon 's outer rainbands . However , Guangdong Province , Hainan Province , and Guangxi were the Chinese regions most extensively impacted . The typhoon brought record wind gusts into Guangxi . In those three regions combined , 13 @,@ 000 homes were estimated to have collapsed and a large swath of farmland was damaged . Two people were killed in China and economic losses approximated to ¥ 2 @.@ 1 billion ( US $ 253 million ) . Due to its positioning and track , of all areas in Vietnam only the country 's more northern regions were impacted by Krovanh . Flash flooding occurred in earnest in those regions , and 1 @,@ 000 homes were flattened . One person was killed and five others were injured in Vietnam . Overall , the typhoon was responsible for the deaths of four persons . 
- 
- = = = Tropical Storm Vamco ( Manang ) = = = 
- 
- The monsoon trough spawned several tropical disturbances in the middle of August , one of which became Tropical Depression Lakay near the Philippines . On August 18 , an area of convection persisted on the southern side of a circulation , developing into a tropical depression east of Luzon . It moved quickly northward and later to the northwest in an area generally unfavorable for strengthening , such as the presence of wind shear and land interaction . On August 19 , the JMA upgraded the depression to Tropical Storm Vamco to the east of Taiwan . Later that day , the circulation passed just 55 km ( 35 mi ) north of Taipei , although the convection was exposed that time . On August 20 , the JMA assessed Vamco as dissipating in the Taiwan Strait , although the JTWC continued advisories until the storm moved ashore in southeastern China . 
- Rainfall in Taiwan reached 69 mm ( 2 @.@ 7 in ) in Ilan County . On the island , the storm left several hundred houses without power due to a lightning strike . On mainland China , rainfall peaked at 101 mm ( 4 @.@ 0 in ) in Wenzhou , Zhejiang , which were largely beneficial in easing drought conditions , while winds gusted to 100 km / h ( 62 mph ) . The storm damaged or destroyed 5 @,@ 880 houses and flooded 1 @,@ 287 ha ( 3 @,@ 180 acres ) of paddy fields , causing ¥ 38 @.@ 6 million ( CNY , $ 4 @.@ 7 million USD ) in damage . 
- 
- = = = Typhoon Dujuan ( Onyok ) = = = 
- 
- On August 27 , a tropical depression developed about 520 km ( 325 mi ) northwest of Guam , which initially drifted to the southwest before turning to the northwest . On August 29 , the JMA upgraded it to Tropical Storm Dujuan , and that day PAGASA began issuing advisories on Tropical Storm Onyok . The storm quickly intensified into a typhoon , after developing an eye in the center . On September 1 , the JMA estimated Dujuan attained peak 10 minute winds of 150 km / h ( 90 mph ) , and the JTWC assessed peak 1 – minute winds of 230 km / h ( 145 mph ) . While near peak intensity , the center of Dujuan passed about 45 km ( 30 mi ) south of the southern tip of Taiwan . The typhoon weakened steadily and was a severe tropical storm by the time it made landfall on September 2 just east of Hong Kong . The JTWC estimated landfall winds of 185 km / h ( 115 mph ) , making it the strongest typhoon to strike the Pearl River Delta since Typhoon Hope in 1979 . Dujuan rapidly weakened while continuing westward through China , dissipating on September 3 over Guangxi . 
- In the Philippines , Dujuan interacted with the monsoon to produce heavy rainfall , killing one person . While in the vicinity , Dujuan produced gusts of 100 km / h ( 62 mph ) on Yonaguni , a Japanese subdivision of Okinawa . Heavy rainfall in Taiwan reached 628 mm ( 24 @.@ 7 in ) in Pingtung County , and winds peaked at 176 km / h ( 109 mph ) on Orchid Island before the anemometer was destroyed . The typhoon caused about NT $ 200 ( NWD , $ 115 million USD ) in crop damage , and killed three people . Damage was minor in Hong Kong , and four fishermen were missing and presumed drowned after their boat sank . On the Chinese mainland , strong winds left 90 % of the city of Shenzhen without power , and killed 16 construction workers due to a half @-@ finished building collapsing . Across Guangdong , the typhoon damaged crops and destroyed 54 @,@ 000 homes . Overall damage in China was estimated at ¥ 2 @.@ 3 billion ( CNY , $ 277 million USD ) , and across Guangdong , the typhoon killed 40 people . 
- 
- = = = Typhoon Maemi ( Pogi ) = = = 
- 
- Typhoon Maemi formed on September 4 from the monsoon trough in the western Pacific Ocean . It slowly intensified into a tropical storm while moving northwestward , and Maemi became a typhoon on September 8 . That day , it quickly intensified due to favorable conditions , developing a well @-@ defined eye and reaching peak maximum sustained winds of 195 km / h ( 120 mph ) . While near peak intensity , Maemi was decelerating and began turning to the north @-@ northeast . The eyewall soon after passed over the Japanese island of Miyako @-@ jima on September 10 , producing the fourth lowest pressure on record in Japan after a pressure of 912 mbar ( 26 @.@ 9 inHg ) was recorded . With warm waters , Maemi was able to maintain much of its intensity before it made landfall just west of Busan , South Korea on September 12 . On Jeju Island , Maemi produced a peak wind gust of 216 km / h ( 134 mph ) and a minimum pressure of 950 mbar ( 28 inHg ) , both setting records for the country , and making it the most powerful typhoon to strike South Korea since record @-@ keeping began in the country in 1904 . The typhoon became extratropical in the Sea of Japan the next day , although the remnants persisted for several more days , bringing strong winds to northern Japan . 
- The typhoon first affected the Ryukyu Islands of Japan . On Miyako @-@ jima , strong winds damaged 104 buildings , and 95 % of residents lost power . Maemi dropped heavy rainfall there , including rates of 58 @.@ 5 mm ( 2 @.@ 30 in ) in an hour , and 402 @.@ 5 mm ( 15 @.@ 85 in ) in 24 hours , the latter setting a record . One person died on Miyako @-@ jima after being struck by flying glass . Elsewhere in Japan , the storm caused flights to be canceled , while rainfall @-@ induced landslides blocked roads . There were two other deaths in Japan , and damage totaled ¥ 11 @.@ 3 billion ( JPY , $ 96 million USD ) . Damage was heaviest in South Korea , notably where it moved ashore . Winds in Busan near the landfall location reached 154 km / h ( 96 mph ) , the second @-@ highest on record . There , the port sustained heavy damage , causing disruptions to exports in the months following the storm . Nationwide , the high winds destroyed about 5 @,@ 000 houses and damaged 13 @,@ 000 homes and businesses , leaving 25 @,@ 000 people homeless . About 1 @.@ 47 million lost power , and widespread crop damage occurred , resulting in the worst rice crop in 23 years . Across South Korea , Maemi killed 117 people , and overall damage totaled ₩ 5 @.@ 52 trillion won ( $ 4 @.@ 8 billion USD ) . 
- 
- = = = Typhoon Choi @-@ wan ( Roskas ) = = = 
- 
- In the middle of September , the monsoon trough spawned a rapidly organizing disturbance east @-@ northeast of Luzon , with weak wind shear and favorable conditions . On September 16 , the JMA classified it as a tropical depression , and the JTWC initiated advisories the next day . The system moved to the northwest due to the subtropical ridge to the northeast and later to the north . On September 18 , the JMA upgraded the depression to Tropical Storm Choi @-@ wan , the same day that PAGASA classified it as Tropical Storm Roskas . An eastward @-@ moving trough turned the storm to the northeast , bringing the track over Okinawa and Amami Ōshima on September 19 . Choi @-@ wan continued gradually intensifying , becoming a typhoon on September 20 to the southeast of Japan . That day , the JMA estimated peak winds of 130 km / h ( 80 mph ) , and the JTWC estimated peak 1 minute winds of 185 km / h ( 115 mph ) on September 21 , after Choi @-@ wan developed a well @-@ defined eye . The typhoon weakened due to increasing wind shear , deteriorating to severe tropical storm status on September 22 before JMA declared it extratropical on September 23 . The remnants of Choi @-@ wan continued to the northeast , exited the basin on September 24 , and eventually struck southern Alaska on September 25 . 
- Wind gusts in Okinawa reached 115 km / h ( 72 mph ) , while on the volcanic island of Hachijō @-@ jima , gusts reached 214 km / h ( 133 mph ) . On the Japanese mainland , winds gusted to 126 km / h ( 78 mph ) at Chōshi , Chiba . Choi @-@ wan dropped heavy rainfall while near Japan , peaking at 316 mm ( 12 @.@ 4 in ) on Miyake @-@ jima . In Okinawa , Choi @-@ wan flooded a boat , forcing its occupants to be rescued by the Coast Guard . Also on the island , heavy rainfall caused landslides and flooded houses . In Amami Ōshima , the storm left 10 @,@ 810 people without power . On Hachijō , wind gusts of 214 km / h ( 133 mph ) damaged about 200 houses . Nationwide , Choi @-@ wan destroyed 191 homes , injured 9 people , and left about ¥ 300 million ( JPY , $ 2 @.@ 5 million USD ) in damage . 
- 
- = = = Typhoon Koppu ( Sikat ) = = = 
- 
- Towards the end of September , the monsoon trough spawned a tropical disturbance east @-@ northeast of Yap , which became a tropical depression on September 24 . There were initially several circulations , with a tropical upper tropospheric trough to the northeast increasing outflow . After slowing and turning to the northeast , the depression intensified into Tropical Storm Koppu on September 26 . After the storm developed a large eye feature , the JTWC upgraded it to typhoon status on September 27 , although the JMA did not follow suit until the following day while near Chichi @-@ jima . Also that day , Koppu passed 95 km ( 60 mi ) west of Iwo Jima , and the JMA estimated peak 10 minute winds of 130 km / h ( 80 mph ) . The JTWC estimated peak 1 minute winds of 165 km / h ( 105 mph ) , before an approaching trough caused the typhoon to accelerate northeastward . The convection diminished near the center , causing Koppu to become extratropical on September 30 . The remnants continued generally northeastward through the Aleutian Islands , eventually passing south of mainland Alaska on October 7 . 
- On Chichi @-@ jima , Typhoon Koppu produced sustained winds of 102 km / h ( 63 mph ) , with gusts to 200 km / h ( 124 mph ) , which was the third strongest on record for the station . Rainfall there reached 183 mm ( 7 @.@ 2 in ) . Wind gusts on Iwo Jima peaked at 109 km / h ( 68 mph ) . 
- 
- = = = Typhoon Ketsana ( Tisoy ) = = = 
- 
- In the middle of October , an area of convection persisted along the monsoon trough between Luzon and Guam , developing into a tropical depression on October 17 . The same monsoon trough later spawned Typhoon Parma to the east . For several days , the system remained disorganized while drifting to the west @-@ northwest due to weak steering currents south of the subtropical ridge . On October 19 , the JMA upgraded the depression to Tropical Storm Ketsana , and by that time the storm had begun drifting to the northeast . With favorable outflow , Ketsana quickly intensified into a typhoon on October 20 after developing an eye , and two days later , the JMA estimated peak winds of 165 km / h ( 105 mph ) . The JTWC estimated peak 1 minute winds of 230 km / h ( 145 mph ) around the time the well @-@ defined eye had expanded to 37 km ( 23 mi ) . Subsequently , the typhoon accelerated northeastward into the westerlies and began weakening due to increasing wind shear and dry air . On October 26 , Ketsana became extratropical to the east of Japan and dissipated the next day . The passage of the typhoon caused surface chlorophyll a concentration in the ocean to increase 30 @-@ fold . 
- 
- = = = Typhoon Parma = = = 
- 
- The same monsoon trough that spawned Typhoon Ketsana also produced an area of convection to the north @-@ northeast of Guam , becoming a tropical depression on October 19 . The system moved northwestward and later turned to the northeast around the subtropical ridge . With low wind shear and favorable outflow , the convection became better organized , and the JMA upgraded it to Tropical Storm Parma on October 21 . After an eye began to form , Parma was upgraded to typhoon status the next day . An approaching trough caused Parma to accelerate northeastward while also increasing outflow . On October 24 , the JMA estimated peak winds of 175 km / h ( 110 mph ) while the JTWC estimated winds of 240 km / h ( 150 mph ) , an unusually high intensity for 30 ° N . Subsequently , Parma rounded the subtropical ridge and began moving to the east @-@ southeast , beginning a nearly week @-@ long loop . The cold front had passed to the north and failed to bring the typhoon northeastward . 
- Increasing wind shear weakened the convection , and Parma deteriorated into a severe tropical storm on October 26 . The next day , it began moving westward while passing about 345 km ( 215 mi ) north of Wake Island . A large eye of 110 km ( 70 mi ) in diameter developed , and on October 28 , the JMA re @-@ upgraded Parma to typhoon status . The next day , the typhoon turned to the northeast due to another approaching trough . With decreasing wind shear and warmer waters , Parma re @-@ intensified significantly on October 29 , reaching a secondary peak of 165 km / h ( 105 mph ) according to JMA , and 215 km / h ( 135 mph ) according to JTWC . The storm moved very closely along the track it took several days prior . Increasing wind shear on October 30 caused rapid weakening , resulting in the eye dissipating . By the next day , the center was exposed , and Parma became extratropical , later exiting the basin on November 1 . The remnants weakened , later turning to the southeast and dissipating on November 11 southwest of California . 
- 
- = = = Severe Tropical Storm Melor ( Viring ) = = = 
- 
- Late in October , an area of convection persisted northwest of Palau and quickly organized into a tropical depression on October 29 . Moving west @-@ northwestward toward the Philippines due to a ridge to the east , the depression intensified into Tropical Storm Melor on October 30 . With minimal wind shear , the storm strengthened further and developed a large eye . The JTWC upgraded Melor to typhoon status on October 31 , estimating peak 1 minute winds of 140 km / h ( 85 mph ) , although the JMA estimated the storm only attained peak 10 minute winds of 95 km / h ( 60 mph ) . Early on November 1 , Melor made landfall on northeastern Luzon in the Philippines , south of Palanan . The storm weakened over land and emerged into the South China Sea . By that time , it was moving northward along the periphery of the ridge to the east . On November 2 , Melor turned to the northeast , passing just east of Taiwan . The next day , it weakened to tropical depression status , and after meandering offshore eastern Taiwan , Melor continued northeastward . It became extratropical on November 5 and dissipated the next day near southern Japan . 
- In the Philippines , Melor dropped about 150 mm ( 6 in ) of rainfall , which flooded the Cagayan River and killed four people . In Taiwan , rainfall reached 554 mm ( 21 @.@ 8 in ) in Pingtung County . On the Japanese island of Hateruma , rainfall totaled 197 mm ( 7 @.@ 8 in ) , which broke the hourly and daily record for the station in November . 
- 
- = = = Typhoon Nepartak ( Weng ) = = = 
- 
- A tropical depression developed near Yap on November 11 . The system intensified gradually as it tracked quickly westward toward the Philippines . An anticyclone aloft allowed for strengthening , and the JMA upgraded the depression to Tropical Storm Nepartak on November 12 . Simultaneously , the cyclone entered the area of responsibility of the Philippine Atmospheric , Geophysical and Astronomical Services Administration , which named it Tropical Storm Weng . At around 1600 UTC on November 13 , Nepartak made landfall on northern Samar Island in the Philippines before traversing the remainder of the archipelago from east to west . The cyclone emerged into the South China Sea weakened but quickly re @-@ intensified while continuing to the west @-@ northwest . The JTWC estimated peak winds of 140 km / h ( 85 mph ) on November 16 , and later that day , the JMA upgraded the storm to typhoon status , estimating peak 10 @-@ minute winds of 120 km / h ( 75 mph ) . On November 18 , Nepartak passed near southwestern Hainan and weakened , with the convection diminishing from the circulation . By the next day , the system weakened to tropical depression status , and dissipated shortly after moving ashore over Beihai , China . 
- In the Philippines , Nepartak produced strong winds , heavy rainfall , and rough seas . The storm caused widespread power outages and ferry disruptions . According to the PAGASA in its post @-@ storm report , a total of 13 people lost their lives during the storm . On Hainan , the storm helped end one of the worst summer droughts in almost 65 years , although it also left heavy crop damage , wrecking 64 @,@ 000 ha ( 160 @,@ 000 acres ) of fields and killing 400 head of livestock . With about 800 homes destroyed , damage on Hainan amounted to $ 197 million ( 2003 USD ) . Effects were minor in mainland China . 
- 
- = = = Typhoon Lupit ( Yoyoy ) = = = 
- 
- Typhoon Lupit formed on November 18 from the monsoon trough to the west of the Marshall Islands . Early in its duration , it moved generally to the west or west @-@ southwest . On November 21 , the depression intensified into Tropical Storm Lupit , and two days later , it strengthened into a typhoon , developing an eye . Lupit later began a prolonged movement to the northwest , during which it passed near several islands in Yap State . The typhoon reached peak intensity on November 26 , with peak 10 – minute sustained winds of 185 km / h ( 115 mph ) . It later weakened due to a variety of unfavorable conditions , and after recurving to the northeast , Lupit became extratropical south of Japan on December 2 . 
- Typhoon Lupit first affected Pohnpei with gusty winds , and later it damaged or destroyed about 200 homes in Chuuk State . There , high waves flooded roads and homes , while high winds damaged crops . Damage was heaviest in Yap State , mostly in the small Ulithi atoll and Fais Island . On both islands , the typhoon contaminated the water supply and wrecked the crops . Rainfall reached 263 mm ( 10 @.@ 35 in ) on Ulithi , and gusts reached 158 km / h ( 98 mph ) . Throughout the FSM , damage totaled about $ 1 @.@ 7 million , although there were no deaths . The damage prompted the FSM government to declare two states as disaster areas , as well as a disaster declaration from the United States federal government . While Lupit was becoming extratropical , it became the first typhoon in December to threaten Japan in 13 years . The storm dropped rainfall that resulted in mudslides and flight cancellations . 
- 
- = = = Other storms = = = 
- 
- An area of convection formed on May 16 to the southwest of Palau , located within an area of weak wind shear . The next day , the JMA and the JTWC both classified the system as a tropical depression . The convection was disorganized in association with multiple circulation centers , although it gradually organized . Moving westward initially , the depression turned more to the north into an area of increasing wind shear . On May 19 , the JTWC upgraded the system to Tropical Storm 03W , and on the same day , PAGASA classified it as Tropical Depression Batibot . Soon after , the convection decreased from the center , and by May 20 , all warning agencies had discontinued advisories . 
- On July 9 , PAGASA classified a system as Tropical Depression Falcon , off the west coast of the Philippines . The JMA also briefly initiated advisories before dropping them later on July 9 . Later in the month , PAGASA briefly issued advisories on Tropical Depression Ineng on July 30 off the east coast of Mindanao . The depression dissipated the next day , causing about P8 million ( PHP , $ 145 @,@ 000 USD ) in damage . The monsoon trough spawned a tropical depression north of Luzon on August 18 , with PAGASA naming it Lakay . There were several circulations in the region , with Tropical Storm Vamco to the northeast near Taiwan , and the overall system moved generally westward . PAGASA briefly classified Lakay as a tropical storm on August 19 before ending advisories the next day . The system spread rainfall across China , reaching 82 @.@ 4 mm ( 3 @.@ 24 in ) in Xiamen , Fujian . 
- On September 5 , former Hurricane Jimena crossed the International Date Line into the basin . By that time , the circulation was largely exposed from the convection , and the center quickly dissipated . Later in the month , the monsoon trough spawned a disturbance east of the Philippines that PAGASA classified as Tropical Depression Quiel on September 15 . The system moved westward but never intensified , dissipating west of Luzon on September 19 . The broad system also spawned Typhoon Choi @-@ wan . 
- In October , the JTWC classified Tropical Depression 18W early in the month off the west coast of Luzon . With weak steering currents , the system moved slowly southwestward before looping to the northwest . On October 10 , the depression dissipated just off the coast of southern China . On October 5 , the JMA monitored a tropical depression southeast of Taiwan that later passed near the island , producing heavy rainfall that peaked at 153 mm ( 6 @.@ 0 in ) in Ilan County . A few days later , the JTWC monitored Tropical Depression 19W , which developed on October 12 after an extratropical storm produced an area of convection . Described as a subtropical low , the depression moved generally northeastward toward Japan due to an approaching cold front . The depression moved through Kyushu and Honshu before dissipating on October 13 . The depression dropped 285 mm ( 11 @.@ 2 in ) of rainfall in Kōchi , while strong winds associated reached 217 km / h ( 135 mph ) through a storm @-@ produced downburst . The winds knocked over two cranes , killing two people , and left about 9 @,@ 000 homes without power . The depression also killed two people due to drownings . On October 16 , the JMA briefly classified a tropical depression to the east of the Marianas Islands . On October 22 , a tropical depression developed in the South China Sea , classified by PAGASA as Ursula . The system moved eastward and crossed Palawan before dissipating on October 24 . In the Philippines , the depression killed one person and caused minor damage . Also in October , the monsoon trough spawned a tropical depression in the Gulf of Thailand , which moved northwestward and crossed into the Indian Ocean , dropping heavy rainfall in Thailand . 
- In mid @-@ November , the JMA briefly tracked a weak tropical depression near Wake Island . The agency also briefly tracked a tropical depression off the coast of Vietnam on December 16 . It finally dissipated on December 17 , with the pressure and winds unknown . The final system of the year was a tropical depression that originated out of the monsoon trough on December 24 east of the Philippines . After initially moving to the west , it turned to the south , and the JTWC estimated the depression intensified into a tropical storm . During this time , PAGASA classified it as Tropical Storm Zigzag . The system made landfall in northeastern Mindanao and dissipated on December 27 , bringing heavy rainfall . 
- 
- = = Storm names = = 
- 
- Within the North @-@ western Pacific Ocean , both the Japan Meteorological Agency ( JMA ) and the Philippine Atmospheric , Geophysical and Astronomical Services Administration assign names to tropical cyclones that develop in the Western Pacific , which can result in a tropical cyclone having two names . The Japan Meteorological Agency 's RSMC Tokyo — Typhoon Center assigns international names to tropical cyclones on behalf of the World Meteorological Organization 's Typhoon Committee , should they be judged to have 10 @-@ minute sustained windspeeds of 65 km / h ( 40 mph ) . The Philippine Atmospheric , Geophysical and Astronomical Services Administration , meanwhile , assigns names to tropical cyclones which move into or form as a tropical depression in their area of responsibility located between 135 ° E and 115 ° E and between 5 ° N @-@ 25 ° N even if the cyclone has had an international name assigned to it . The names of significant tropical cyclones are retired , by both PAGASA and the Typhoon Committee . Should the list of names for the Philippine region be exhausted then names will be taken from an auxiliary list of which the first ten are published each season . Unused names are marked in gray . 
- 
- = = = International names = = = 
- 
- During the season 21 named tropical cyclones developed in the Western Pacific and were named by the Japan Meteorological Agency , when it was determined that they had become tropical storms . These names were contributed to a list of 140 names submitted by the fourteen member nations and territories of the ESCAP / WMO Typhoon Committee . 
- 
- = = = Philippines = = = 
- 
- The Philippine Atmospheric , Geophysical and Astronomical Services Administration uses its own naming scheme for tropical cyclones in their area of responsibility . PAGASA assigns names to tropical depressions that form within their area of responsibility and any tropical cyclone that might move into their area of responsibility . Should the list of names for a given year prove to be insufficient , names are taken from an auxiliary list , the first 10 of which are published each year before the season starts . The names not retired from this list will be used again in the 2007 season . Names that were not assigned are marked in gray . 
- 
- = = = Retirement = = = 
- 
- The names Imbudo and Maemi were retired by the ESCAP / WMO Typhoon Committee . The names Molave and Mujigae were chosen to replace Imbudo and Maemi respectively . In addition , Hong Kong requested that Yanyan be removed from the list , and it was replaced by Dolphin . The Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) announced that the name Harurot was retired due to extensive damage . The name Hanna was chosen to replace Harurot . Also , the name " Koni " was replaced by " Goni " , after it was found that Koni was a misspelling . 
- 
- = = Storm effects = = 
- 
- The following table provides basic meteorological and impact information for each tropical cyclone from the 2003 Pacific typhoon season in tabular format ; unnamed tropical cyclones are not included . PAGASA names for storms are provided in parentheses . Storms entering from the Central Pacific only include their information while in the western Pacific , and are noted with an asterisk * . 
- 
- 
- = Oxaziridine = 
- 
- An oxaziridine is an organic molecule that features a three @-@ membered heterocycle containing oxygen , nitrogen , and carbon . In their largest application , oxaziridines are intermediates in the industrial production of hydrazine . Oxaziridine derivatives are also used as specialized reagents in organic chemistry for a variety of oxidations , including alpha hydroxylation of enolates , epoxidation and aziridination of olefins , and other heteroatom transfer reactions . Oxaziridines also serve as precursors to amides and participate in [ 3 + 2 ] cycloadditions with various heterocumulenes to form substituted five @-@ membered heterocycles . Chiral oxaziridine derivatives effect asymmetric oxygen transfer to prochiral enolates as well as other substrates . Some oxaziridines also have the property of a high barrier to inversion of the nitrogen , allowing for the possibility of chirality at the nitrogen center . 
- 
- = = History = = 
- 
- Oxaziridine derivatives were first reported in the mid @-@ 1950s by Emmons and subsequently by Krimm and Horner and Jürgens . Whereas oxygen and nitrogen typically act as nucleophiles due to their high electronegativity , oxaziridines allow for electrophilic transfer of both heteroatoms . This unusual reactivity is due to the presence of the highly strained three membered ring and the relatively weak N @-@ O bond . Nucleophiles tend to attack at the aziridine nitrogen when the nitrogen substituent is small ( R1 = H ) , and at the oxygen atom when the nitrogen substituent has greater steric bulk . The unusual electronics of the oxaziridine system may be exploited to perform a number of oxygen and nitrogen transfer reactions including , but not limited to : α @-@ hydroxylation of enolates , epoxidation of alkenes , selective oxidation of sulfides and selenides , amination of N @-@ nucleophiles and N @-@ acylamidation . 
- The Peroxide process for the industrial production of hydrazine through the oxidation of ammonia with hydrogen peroxide in the presence of ketones was developed in the early 1970s . 
- Chiral camphorsulfonyloxaziridines proved useful in the syntheses of complex natural products , such as taxol , which is marketed as a chemotherapy agent . Both the Holton Taxol total synthesis and the Wender Taxol total synthesis feature asymmetric α @-@ hydroxylation with camphorsulfonyloxaziridine . 
- 
- = = Synthesis = = 
- 
- 
- = = = N @-@ H , N @-@ Alkyl , N @-@ Aryloxaziridines = = = 
- 
- The two main approaches to synthesis of N @-@ H , N @-@ alkyl , and N @-@ aryloxaziridines are oxidation of imines with peracids ( A ) and amination of carbonyls ( B ) . 
- Additionally , oxidation of chiral imines and oxidation of imines with chiral peracids may yield enantiopure oxaziridines . Some oxaziridines have the unique property of configurationally stable nitrogen atoms at room temperature due to an inversion barrier of 24 to 31 kcal / mol . Enantiopure oxaziridines where stereochemistry is entirely due to configurationally stable nitrogen are reported . 
- 
- = = = N @-@ Sulfonyloxaziridines = = = 
- 
- In the late 1970s and early 1980s Franklin A. Davis synthesized the first N @-@ sulfonyloxaziridines , which act exclusively as oxygen transfer reagents , and are the most predominantly used class of oxaziridines today . While originally synthesized with mCPBA and the phase transfer catalyst benzyltrimethylammonium chloride , an improved synthesis using oxone as the oxidant is now most prevalent . 
- Many N @-@ sulfonyloxaziridines are used today , each with slightly different properties and reactivity . These reagents are summarized in the table below . 
- 
- = = = Perfluorinated oxaziridines = = = 
- 
- With highly electron withdrawing perfluoroalkyl substituents , oxaziridines exhibit reactivity more similar to that of dioxiranes than typical oxaziridines . Notably , perfluoroalkyloxaziridines hydroxylate certain C @-@ H bonds with high selectivity . Perfluorinated oxaziridines may be synthesized by subjecting a perfluorinated imine to perfluoromethyl fluorocarbonyl peroxide and a metal fluoride to act as an HF scavenger . 
- 
- = = Reactions of oxaziridines = = 
- 
- 
- = = = Hydrazine production = = = 
- 
- Oxaziridines are intermediates in the Peroxide process for the production of hydrazine . Many millions of kilograms of hydrazine are produced annually by this method that involves a step wherein ammonia is oxidized in the presence of methyl ethyl ketone to give the oxaziridine : 
- Me ( Et ) C = O + NH3 + H2O2 → Me ( Et ) CONH + H2O 
- In subsequent steps the oxaziridine is converted to the hydrazone , which is the immediate precursor to hydrazine : 
- Me ( Et ) CONH + NH3 → Me ( Et ) C = NNH2 + H2O 
- 
- = = = Oxygen transfer = = = 
- 
- 
- = = = = α @-@ Hydroxylation of enolates = = = = 
- 
- α @-@ Hydroxyketones , or acyloins , are important synthetic motifs present in many natural products . α @-@ Hydroxyketones have been synthesized in many ways , including reduction of α @-@ diketones , substitution of a hydroxyl for a leaving group and direct oxidation of an enolate . Oxodiperoxymolybdenum ( pyridine ) - ( hexamethylphosphoric triamide ) ( MoOPH ) and N @-@ sulfonyloxaziridines are the most common electrophilic sources of oxygen implemented in this process . One advantage of using N @-@ sulfonyloxaziridines is that higher chiral induction is almost invariably observed relative to MoOPH and other oxidants . High yield ( 77 @-@ 91 % ) and dr ( 95 : 5 - 99 : 1 ) are reported for α @-@ hydroxylation with the Evans ' chiral auxiliary with N @-@ sulfonyloxaziridine as the electrophile . Chiral induction has been demonstrated with many other chiral ketones and ketones with chiral auxiliaries , including SAMP and RAMP . 
- Extensive work has been reported on asymmetric hydroxylation of prochiral enolates with camphorsulfonyloxaziridine derivatives , achieving moderate to high enantiomeric excess . The commonly accepted proposed transition state that justifies this stereochemical outcome involves an open transition state where the steric bulk of R1 determines the face of approach . 
- The selectivity of some hydroxylations may be drastically improved in some cases with the addition of coordinating groups alpha to the oxaziridine ring as oxaziridines 3b and 3c in the table above . In these instances it is proposed that the reaction proceeds through a closed transition state where the metal oxyanion is stabilized by chelation from the sulfate and coordinating groups on the camphor skeleton . 
- α @-@ Hydroxylation with oxaziridines has been widely implemented in total synthesis . It is a key step in both the Holton Taxol total synthesis and the Wender Taxol total synthesis . Additionally , Forsyth implemented the transformation in his synthesis of the C3 @-@ C14 ( Substituted 1 @,@ 7 @-@ Dioxaspiro [ 5 @.@ 5 ] undec @-@ 3 @-@ ene ) System of Okadaic acid . 
- 
- = = = = Epoxidation of alkenes = = = = 
- 
- Epoxidation of alkenes is a common reaction because epoxides can be derivatized in a number of useful ways . Classically , laboratory epoxidation is carried out with mCPBA or other peracids . Oxaziridines have been found to be useful for the formation of highly acid sensitive epoxides . ( − ) -Chaetominine was synthesized via oxaziridine epoxidation as a late stage transformation as seen below . 
- Another transformation of high synthetic utility is asymmetric epoxidation . A number of asymmetric epoxidations exist : the Sharpless epoxidation , the Jacobsen @-@ Katsuki epoxidation , and the Juliá @-@ Colonna Epoxidation . These methods require specific functionality in order to achieve selectivity . The Sharpless epoxidation is specific to allylic alcohols , the Jacobsen epoxidation requires cis @-@ disubstituted aryl alkenes , and the Juliá epoxidation requires α @-@ β unsaturated ketones . Chiral oxaziridines act stereospecifically on many unfunctionalized alkenes . It has even been possible to effect stereospecific epoxidation catalytically in the oxaziridine chiral unit . Further investigation into these reactions may be required before levels of enantiomeric excess become practical for large scale synthesis . Lusinichi et al. have investigated asymmetric epoxidation with a chiral oxaziridinium salt using oxone as the stoichiometric oxidant seen below . 
- 
- = = = = Hydroxylation of unactivated hydrocarbons = = = = 
- 
- Perfluorinated oxaziridines are known to hydroxylate unactivated hydrocarbons with remarkable regio- and diastereospecificity . This is a highly coveted transformation , and similar reactivity and specificity is seldom rivaled , especially considering the nonmetallic nature of the oxidant . Perfluorinated oxaziridines show high selectivity toward tertiary hydrogens . Hydroxylation of primary carbons and dihydroxylation of a compound with two oxidizable sites have never been observed . Retention of stereochemistry is very high , often 95 - 98 % . ( retention of stereochemistry may be further enhanced by the addition of a fluoride salt ) . 
- 
- = = = Nitrogen transfer = = = 
- 
- Oxaziridines with unsubstituted or acylated nitrogens are capable of nitrogen atom transfer , although this reactivity has received considerably less attention . 
- 
- = = = = Amination of N @-@ nucleophiles = = = = 
- 
- Amination of nucleophiles with N @-@ unsubstituted oxaziridines is quite versatile in the breadth of possible nucleophiles and corresponding products . Hydrazines may be derived from the amination of secondary or tertiary amines , hydroxylamine and thiohydroxamines may be formed from their corresponding alcohols and thiols , sulfimides may be formed from thioethers and α @-@ aminoketones may be formed by attack of corresponding enolates . 
- 
- = = = = N @-@ acylamidation = = = = 
- 
- The transfer of acylated amines is more difficult than that of unsubstituted amines , although , unlike amine transfer by oxaziridines , there are no alternative methods that directly transfer acylated amines . Acylamine transfer has primarily been performed using amines and hydrazines as nucleophiles . Very few transfers of acylated nitrogens to carbon nucleophiles have been successfully performed , although some do exist in the literature . 
- 
- = = = Rearrangements = = = 
- 
- Oxaziridines have been found to undergo rearrangement reactions via a radical mechanism when irradiated with UV light or in the presence of a single electron transfer reagent such as CuI . Spirocyclic oxaziridines undergo ring expansions to the corresponding lactam . Interestingly , the migrating substituent is determined by a stereoelectronic effect where the group trans to the lone pair on the nitrogen will always be the predominant migration product . In light of this effect , it is possible to take advantage of the chiral nitrogen due to high inversion barrier to direct the rearrangement . This phenomenon is demonstrated by observed selectivities in the rearrangements below . In the rearrangement on the left the thermodynamically unfavorable product is observed exclusively , while in the reaction on the right the product derived from the less stable radical intermediate is favored . 
- Aubé takes advantage of this rearrangement as the key step in his synthesis of ( + ) -yohimbine , a natural medicine classified by the NIH as possibly effective in the treatment of erectile dysfunction and the sexual problems caused by selective serotonin reuptake inhibitors . 
- It is also notable that oxaziridines will thermally rearrange to nitrones . Cis @-@ trans selectivity of the resulting nitrone is poor ; however , yields are good to excellent . It is thought that some oxaziridines racemize over time through a nitrone intermediate . 
- 
- = = = Cycloadditions with heterocumulenes = = = 
- 
- Oxaziridines undergo cycloaddition reactions with heterocumulenes to afford a number of unique five membered heterocycles , as depicted in the figure below . This reactivity is due to the strained three membered ring and weak N @-@ O bond . 
- 
- 
- = Battle of Dürenstein = 
- 
- The Battle of Dürenstein ( also known as the Battle of Dürrenstein , Battle of Dürnstein and Battle of Diernstein ; German : Gefecht bei Dürrenstein ) , on 11 November 1805 was an engagement in the Napoleonic Wars during the War of the Third Coalition . Dürenstein ( modern Dürnstein ) is located in the Wachau Valley , on the River Danube , 73 kilometers ( 45 mi ) upstream from Vienna , Austria . The river makes a crescent @-@ shaped curve between Dürnstein and nearby Krems an der Donau and the battle was fought in the flood plain between the river and the mountains . 
- At Dürenstein a combined force of Russian and Austrian troops trapped a French division commanded by Théodore Maxime Gazan . The French division was part of the newly created VIII Corps , the so @-@ called Corps Mortier , under command of Édouard Mortier . In pursuing the Austrian retreat from Bavaria , Mortier had over @-@ extended his three divisions along the north bank of the Danube . Mikhail Illarionovich Kutuzov , commander of the Coalition force , enticed Mortier to send Gazan 's division into a trap and French troops were caught in a valley between two Russian columns . They were rescued by the timely arrival of a second division , under command of Pierre Dupont de l 'Étang . The battle extended well into the night . Both sides claimed victory . The French lost more than a third of their participants , and Gazan 's division experienced over 40 percent losses . The Austrians and Russians also had heavy losses--close to 16 percent--but perhaps the most significant was the death in action of Johann Heinrich von Schmitt , one of Austria 's most capable chiefs of staff . 
- The battle was fought three weeks after the Austrian capitulation at Ulm and three weeks before the Russo @-@ Austrian defeat at the Battle of Austerlitz . After Austerlitz Austria withdrew from the war . The French demanded a high indemnity and Francis II abdicated as Holy Roman Emperor , releasing the German states from their allegiance to the Holy Roman Empire . 
- 
- = = Background = = 
- 
- In a series of conflicts from 1803 @-@ 15 known as the Napoleonic Wars , various European powers formed five coalitions against the First French Empire . Like the wars sparked by the French Revolution ( 1789 ) , these further revolutionized the formation , organization and training of European armies and led to an unprecedented militarization , mainly due to mass conscription . Under the leadership of Napoleon , French power rose quickly as the Grande Armée conquered most of Europe , and collapsed rapidly after the disastrous invasion of Russia in 1812 . Napoleon 's empire ultimately suffered complete military defeat in the 1813 – 14 campaigns , resulting in the restoration of the Bourbon monarchy in France . Although Napoleon made a spectacular return in 1815 , known as the Hundred Days , his defeat at the Battle of Waterloo , the pursuit of his army and himself , his abdication and banishment to the Island of Saint Helena concluded the Napoleonic Wars . 
- 
- = = Danube campaign = = 
- 
- From 1803 @-@ 06 the Third Coalition fought the First French Empire and its client states ( see table at right ) . Although several naval battles determined control of the seas , the outcome of the war was decided on the continent , predominantly in two major land operations in the Danube valley : the Ulm campaign in the upper Danube and the Vienna campaign , in the middle Danube valley . 
- Political conflicts in Vienna delayed Austria 's entry into the Third Coalition until 1805 . After hostilities of the War of the Second Coalition ended in 1801 , Archduke Charles--the emperor 's brother--took advantage of the subsequent years of peace to develop a military restructuring plan . He carefully put this plan into effect beginning in 1803 – 04 , but implementation was incomplete in 1805 when Karl Mack , Lieutenant Field Marshal and Quartermaster @-@ General of the Army , implemented his own restructuring . Mack bypassed Charles ' methodical approach . Occurring in the field , Mack 's plan also undermined the overall command and organizational structure . Regardless , Mack sent an enthusiastic report to Vienna on the military 's readiness . Furthermore , after misreading Napoleon 's maneuvers in Württemberg , Mack also reported to Vienna on the weakness of French dispositions . His reports convinced the war party advising the emperor , Francis II , to enter the conflict against France , despite Charles ' own advice to the contrary . Responding to the report and rampant anti @-@ French fever in Vienna , Francis dismissed Charles from his post as generalissimo and appointed his Francophobic brother @-@ in @-@ law , Archduke Ferdinand , as commander . 
- The inexperienced Ferdinand was a poor choice of replacement for the capable Charles , having neither maturity nor aptitude for the assignment . Although Ferdinand retained nominal command , day @-@ to @-@ day decisions were placed in the hands of Mack , equally ill @-@ suited for such an important assignment . When Mack was wounded early in the campaign , he was unable to take full charge of the army . Consequently , command further devolved to Lieutenant Field Marshal Karl Philipp , Prince of Schwarzenberg , an able cavalry officer but inexperienced in the command of such a large army . 
- 
- = = = Road to Ulm = = = 
- 
- The campaign in the upper Danube valley began in October , with several clashes in Swabia . Near the Bavarian town of Wertingen , 40 kilometers ( 25 mi ) northwest of Augsburg , on 8 October the 1st Regiment of dragoons , part of Murat 's Reserve Cavalry Corps , and grenadiers of Lannes ' V Corps surprised an Austrian force half its size . The Austrians were arrayed in a line and unable to form their defensive squares quickly enough to protect themselves from the 4 @,@ 000 dragoons and 8 @,@ 000 grenadiers . Nearly 3 @,@ 000 Austrians were captured and over 400 were killed or wounded . A day later , at another small town , Günzburg--immediately south of the Danube River--the French 59th Regiment of the Line stormed a bridge over the Danube and , humiliatingly , chased two large Austrian columns toward Ulm . 
- The campaign was not entirely bad news for Vienna . At Haslach , Johann von Klenau arranged his 25 @,@ 000 infantry and cavalry in a prime defensive position and , on 11 October , the overly confident General of Division Pierre Dupont de l 'Étang attacked Klenau 's force with fewer than 8 @,@ 000 men . The French lost 1 @,@ 500 men killed and wounded . Aside from taking the Imperial Eagles and guidons of the 15th and 17th Dragoons , Klenau 's force also captured 900 men , 11 guns and 18 ammunition wagons . 
- Klenau 's victory was a singular success . On 14 October Mack sent two columns out of Ulm in preparation for a breakout to the north : one under Johann Sigismund Riesch headed toward Elchingen to secure the bridge there , and the other under Franz von Werneck went north with most of the heavy artillery . Recognizing the opportunity , Marshal Michel Ney hurried the rest of his VI Corps forward to re @-@ establish contact with Dupont , who was still north of the Danube . In a two @-@ pronged attack Ney sent one division to the south of Elchingen on the right bank of the Danube . This division began the assault at Elchingen . At the same time another division crossed the river to the east and moved west against Riesch 's position . After clearing Austrian pickets from a bridge , the French attacked and captured a strategically located abbey at the top of the hill at bayonet point . The Austrian cavalry unsuccessfully tried to fend off the French , but the Austrian infantry broke and ran . In this engagement alone , the Austrians lost more than half their reserve artillery park , 6 @,@ 000 ( out of 8 @,@ 000 total participants ) dead , wounded or captured and four colors . Riesch 's column also failed to destroy the bridges across the Danube . 
- Napoleon 's lightning campaign exposed the Austrian indecisive command structure and poor supply apparatus . Mack completely misread the French dispositions and scattered his forces ; as the French defeated each unit separately , the surviving Austrians withdrew toward the Ulm fortifications . Napoleon arrived to take personal command of close to 80 @,@ 000 men . At Ulm on 16 October Karl Mack surrendered his encircled army of 20 @,@ 000 infantry and 3 @,@ 273 cavalry . The officers were released on the condition that they not serve against France until formally exchanged for French officers captured by the Austrians , an agreement to which they held . 
- 
- = = Prelude to battle = = 
- 
- The few Austrian corps not trapped at Ulm withdrew toward Vienna , with the French in close pursuit . A Russian army under Gen. Mikhail Kutuzov also maneuvered away from the French , withdrawing to the east . At the Ill river on 22 October it joined with the retreating Austrian corps commanded by Michael von Kienmayer . On 5 November the Coalition forces held a successful rearguard action in Amstetten . On 7 November the Russians arrived in St. Pölten and crossed the Danube river the next day . Late on 9 November they destroyed the bridges across the Danube , holding the last one at the hamlet of Stein , near the village Krems , until the late afternoon . 
- 
- = = = Battlefield = = = 
- 
- To the east of Stein , 2 kilometers ( 1 @.@ 2 mi ) down an old road , lay Krems , with its small population of a few hundred , at the confluence of the stream of that name and the Danube . To the west of Stein the Danube made a large curve , creating a crescent @-@ shaped floodplain between it and the mountains . At the far western end of the floodplain , where the mountains came down almost to the river 's edge , was Dürenstein with its castle , known as Schloss Dürenstein . The castle had served as a prison for Richard I of England in 1193 . In 1645 – 46 , during the Thirty Years War , the Swedes had fortified the castle and then demolished it when they withdrew . It stands at 159 meters ( 522 ft ) , on the highest ridge of a mountain fissured with clefts and pinnacles of granite . Because the mountain was sparsely vegetated , it was difficult to distinguish the ruins from the rocks . Narrow canyons cut through the mountain , and widen into the plain below . Between Dürenstein and Stein , on the flood plain , lay the hamlets of Oberloiben and Unterloiben . Near the hamlets , the Loiben flood plain was at its widest , extending at the most 762 meters ( 2 @,@ 500 ft ) from the base of the Loibenberg mountain to the bank of the river . 
- The region was known for its wine . Since the 15th century the local inhabitants practiced viticulture and the wine producers formed St. Paul Vintners ' Guild in 1447 , the oldest such guild in the German @-@ speaking world . Terraced vineyards extended up the sides of the Krems River until it became a mountain stream and terrain was unsuitable for cultivation . The Loiben plain supported both viticulture and agriculture . As the terrain became steeper , the vines grew in terraces built from the dark Urgestein , primordial rock . From Dürenstein to Krems the river makes its wide curve ; the mountains and the steeply terraced slopes prevent clear line @-@ of @-@ sight between the two towns . 
- 
- = = = Dispositions = = = 
- 
- Napoleon had calculated that Kutuzov would withdraw toward Vienna , expecting reinforcements from Russia ; he envisioned that the armies would engage in a great battle at Vienna , and that this battle would decide the war . Consequently , Napoleon drew divisions from four of the other seven corps of the Grande Armée to create a new VIII Corps . This corps was to secure the north shore of the Danube , block any of the Austrian or Russian groups from reinforcing one another and , more importantly , prevent Kutuzov from crossing the river and escaping to Russia . 
- The new VIII Corps , under the overall command of Édouard Mortier , included three infantry divisions and a division of cavalry ( see Order of Battle below ) . Corps Mortier , as it was known , crossed the Danube at Linz and Passau in early November 1805 and marched east , on the north bank of the Danube . Operating independently , the corps ' cavalry conducted reconnaissance ahead of them and on the flanks . Gen. Gazan 's division ( about 6 @,@ 000 men ) took the lead ; Mortier was with them . They were followed by Gen. Dupont 's division ( another 4 @,@ 000 ) about one day 's march behind . Jean @-@ Baptiste Dumonceau 's division ( another 4 @,@ 000 ) , marching another day behind Dupont , brought up the rear . A flotilla of 50 boats acquired at Passau provided communications across the Danube . Before sending Mortier on his mission , Napoleon instructed him to protect his north flank at all times against possible Russian reinforcements , advice he reiterated in subsequent written orders . Napoleon also advised Mortier to secure all crossings of the Danube between Linz and Vienna . 
- On 9 November Gazan 's division reached Marbach an der Donau and covered the 50 kilometers ( 31 mi ) to Dürenstein by early on the following afternoon . Here it skirmished with some Russian patrols to the east of the town and expelled them . Feeling confident , the French established a forward post just upstream from Stein . In Dürenstein itself Mortier set up his command post and directed the establishment of a small field hospital . Although the position seemed secure , he had ignored Napoleon 's strict instructions and neglected to protect his left ( north ) flank . 
- This failure was an important factor when Mortier lost his corps ' so @-@ called " eyes " : after he and Gazan had crossed the Danube , the French dragoons had veered to the northwest , leaving only three squadrons of the 4th Dragoons available for reconnaissance . These had left the division and were operating independently of Gazan 's command . Consequently , Mortier and Gazan marched blindly through the narrow canyon west of Dürenstein , not knowing what lay ahead of them . Kutuzov had led the Coalition army across the Danube at Krems , a short distance past Stein , and destroyed the bridge behind him . His actions deprived the French commanders of a possible route across the Danube , putting the deployment of the entire French division at further risk in the case of retreat . In this decision Kutuzov abandoned Vienna to the French , who were converging on the Austrian capital from the north , west and southwest , for the security of uniting with reinforcements from Galicia . Kutuzov chose a military solution over a political one . 
- Unknown to either Gazan or Mortier , the Coalition had concentrated a force of approximately 24 @,@ 000 men ( mostly Russians and a few Austrians ) within a few kilometers of the French position at Dürenstein . In comparison , Gazan 's division had only 6 @,@ 000 men . The Austro @-@ Russian force was a mixture of infantry , Jägers ( usually deployed as skirmishers ) , Russian musketeers and Russian and Austrian cavalry , accompanied by more than 68 artillery pieces . Kutuzov , who had learned the military arts under the tutelage of the legendary Russian Generalissimo Suvorov , had overall command . The Russian cavalry , units of the greatly feared Cossacks , were well @-@ suited for patrolling the river bank ; indeed , on 9 November they had taken 40 French soldiers as prisoners . Furthermore , reinforcements stood in Moravia , less than two weeks ' march away . If the main body of the French army crossed the river , they would require time to prepare . Kutuzov would have ample warning of any large @-@ scale French movement . 
- After the afternoon 's initial skirmishing with the French , Kutuzov held a council of war on the evening of 10 November at Melk , at the great abbey there . He knew several things . First , he knew the positions of the French from prisoners his Cossacks had captured . He also knew that Gazan had crossed at Linz and was well ahead of any French reinforcements : Dupont had crossed at Passau and , by 10 November , stood at Marbach , 50 kilometers ( 31 mi ) upstream , and Dumonceau was another 7 kilometers ( 4 mi ) further behind him . Kutuzov knew the size of the French force--its division strength--and its positions , and he knew that most of the dragoons were not covering the French flank but had turned north . He also knew , or had made a good supposition , about Napoleon 's orders , so he knew what to offer Mortier and Gazan as bait . 
- 
- = = = Battle plan = = = 
- 
- In addition to the Russian generals , the council included Austrian commanders Lieutenant Field Marshal Johann Heinrich von Schmitt and Friedrich Karl Wilhelm , Fürst zu Hohenlohe . Schmitt , who had retired from the military in 1800 , had been recalled into service after the Ulm debacle and had come to Kutuzov highly recommended by the Emperor . He was an experienced tactician and strategist and had served in a variety of posts in the Habsburg military ; he had been Archduke Charles ' trusted adviser during the campaigns from 1796 to 1800 and had assisted in planning several of Charles ' victories . Upon his recall , Schmitt was appointed Chief of the Quartermaster General Staff of the Coalition Army . The generals had found among the Austrian force one Capt. Christoph Freiherr von Stiebar ( 1753 – 1824 ) , who had knowledge of the local geography . 
- Together , Schmitt , Kutuzov and the other generals , with von Stiebar 's advice on the local terrain , concocted a plan to encircle the French at Dürenstein . Russian commander Mikhail Andreyevich Miloradovich would approach Gazan 's division from the east , supported by Petr Bagration 's corps , and pin the French in place . Three additional columns , commanded by Dmitry Dokhturov ( Doctorov ) , Maj. Gen. Strik and Schmitt , would outflank the French from the west and the north . They would offer , as bait , a rumor : the Russian army was retreating into Moravia and only a rearguard would be left at Krems . 
- 
- = = Battle = = 
- 
- On the night of 10 – 11 November a Russian column under Strik 's command began its passage through the narrow canyons , intent on arriving at Dürenstein by noon ; two more columns , under Dokhturov and Schmitt , moved in wider semicircles , planning to pass through the mountains and attack the French , who were extended along the river bank . According to the plan , in late morning Strik 's column would emerge from the mountains first and launch a flanking assault on the French right . This flanking attack , combined with Miloradovich 's frontal assault from Stein , would force the French into a vise ; encircled , they would have no option but to surrender--or die . To ensure the success of the plan , the second and third columns , under Dokhturov and Schmitt , would arrive in early and mid @-@ afternoon and support the earlier assaults . In this way , even if the French tried to retreat west to Marbach , they would not escape the vise @-@ like grip of the Coalition army . 
- Mortier accepted the bait of a rumored Russian retreat . In the early morning of 11 November he and Gazan departed from Dürenstein to seize Stein and Krems , presuming the Russians had either abandoned the settlements or left only a small rear @-@ guard behind . As they approached Stein , a column of Miloradovich 's troops attacked the French forward positions . Thinking this force was the rumored Russian rear guard , Mortier ordered Gen. Gazan to counterattack and push east towards the town of Stein . Fighting spread through the villages of Oberloiben , Unterloiben and the farm at Rothenhof . Instead of withdrawing , as a rear guard would , more and more Russian troops appeared and engaged the French column . 
- Initially Gazan made rapid progress , but he quickly recognized that the opposing force was much stronger than the typical rear guard of a retreating army . Realizing he had been duped and that Gazan 's troops were tiring rapidly , Mortier sent orders to Dupont 's division to hurry forward . By mid @-@ morning the French momentum had stalled ; Mortier committed most of his remaining forces to driving Miloradovich back , leaving a single battalion--perhaps 300 troops--to cover his northern flank , and sent the rest to attack the Russian right . Within 30 minutes he achieved the superiority of numbers he sought . His 4 @,@ 500 French opposed 2 @,@ 600 Russians and forced them back toward Stein while pressing an attack along the river . Miloradovich had no option , for neither Strik 's nor Dokhturov 's flanking columns were to be seen . 
- At this stage of the battle fighting paused . Mortier and Gazan waited for Dupont 's arrival while Kutuzov and Miloradovich waited for Strik 's and Dokhturov 's . Schmitt 's column was expected to be the last to join the fight because it had to march the greatest distance . The timing of the respite varies , depending on whose reports are consulted : fighting paused at around 12 : 00 or 14 : 00 . Strik arrived first and immediately assaulted Gazan 's line with three battalions , pushing the French out of Dürenstein . Caught between two strong forces , Gazan attempted to push his way back through Dürenstein , to reach the river where the flotilla could evacuate his exhausted troops . Withdrawing through the narrow Danube canyon and fighting off the Russian force at their rear , Gazan and his division were trapped when more of Strik 's Russians appeared to block their retreat . The narrow defiles hampered the Russians ; Strik 's men had to march out of the canyons , form ranks and attack in waves . Despite Strik 's continuous assault in the next two to three hours , Mortier and Gazan pushed the Russians back up the narrow fissure in the hillside . At this point , Dokhturov 's column appeared behind the French line and joined the battle . The French were outnumbered more than three to one , assaulted in the front by Miloradovich 's column , in the middle by Strik 's and in the rear by Dokhturov . 
- Earlier in the morning Dupont had proceeded with his column south and east along the river , from Marbach , according to instructions . Even before the arrival of Mortier 's courier , he heard the sound of artillery in the distance and sent riders ahead to discover the cause . They came back to report that a Russian column ( Dokhturov 's ) was descending from the mountains to take the road to Dürenstein . Realizing this would separate him from the forward division , Dupont hustled his troops toward the sound of battle and deployed them to take the Russians in the flank . The French assault , heralded by cannon fire , caused Dokhturov 's troops to turn their attention from Gazan 's beleaguered force to face these new assailants . Although superior in numbers , Dokhturov 's column had no supporting artillery , and the narrow space prevented them from taking advantage of their size . It was Dokhturov 's turn to face attackers at his front and rear , until the arrival of Schmitt 's column , which had wended its way through the mountains in the west . 
- Schmitt arrived at dusk , and the action continued well after dark ; in mid @-@ November night falls at close to 17 : 00 in the upper Danube climes . Despite the darkness , Schmitt descended out of the defiles and deployed his troops to assail Dupont 's flank . As his Russians entered the fray , they came between a battalion of French and another of Russians . With the additional force , the French were overwhelmed , but most of the shooting subsided when the combatants could not tell apart friend from foe in the dark . Under the cover of darkness , Mortier used the French flotilla to evacuate his exhausted troops to the south bank . The French and Russians continued to skirmish fitfully into the night as sentries encountered one another in the dark . Portions of Gazan 's force provided any necessary rear guard action , and the following morning the remaining men were evacuated from the north shore of the Danube , while they maintained possession of only Spitz and Weissenkirchen on the north bank . 
- 
- = = = Losses = = = 
- 
- The losses were staggering : Gazan lost close to 40 percent of his division to death and wounds . Aside from losing five guns , 47 officers and 895 men under his command were captured , bringing the loss of effectives closer to 60 percent ; furthermore , he lost the eagles of the 4th Infantry Regiment ( France ) and the eagle and guidon of the 4th Dragoons . The Russians lost around 4 @,@ 000 , about 16 percent of their force , and two regimental colors . The Austrian Lieutenant Field Marshal Schmitt was killed as the battle concluded , probably by Russian musketry in the confused melee . The vineyards and the villages of Ober- and Unterloiben were destroyed , as was most of Dürenstein and Stein . Krems was heavily damaged ; the French plundered the town at least twice , and " barbarously handled " its inhabitants . 
- 
- = = Aftermath = = 
- 
- Both sides claimed victory . Although losses were fairly equal in terms of numbers – 4 @,@ 000 wounded or dead on each side – the Coalition forces went into battle with 24 @,@ 000 men while the French started with Gazan 's division of about 6 @,@ 000 , which grew close to 8 @,@ 000 when Dupont 's men joined the fighting in the afternoon . Regardless , Gazan 's division was nearly destroyed ; the 30 percent losses experienced by the French fell predominantly on his division . Clearly for both sides , the fighting was hard . The weather had been cold ; an early storm had left slick icy mud in the roadways , and icicles " like chandeliers " hung from the trees . 
- For the Coalition , the Russians were secure on the north bank of the Danube , awaiting reinforcements from Galicia ; the bridges between Linz and Vienna had been destroyed , making French access to the Austrian capital more difficult , but not impossible . After six months of fighting in which the Austrians had enjoyed little good news , the Coalition could claim a difficult and timely victory . The French had retreated from the field with a badly mauled division and Kutuzov had secured the right flank . Indeed , Francis was so pleased with the outcome at Dürenstein that he awarded Kutuzov the Military Order of Maria Theresa . 
- For the French , the survival of the Corps Mortier seemed nothing short of a miracle . The remainder of Gazan 's division crossed the river the next morning and eventually recuperated in Vienna , which the French acquired by deception later in the month . More importantly for them , the French force had performed well over difficult terrain and under terrible combat conditions . Initially there had been some panic and parts of at least one French battalion had tried to escape on the flotilla craft . They had lost control of the boats in the current and smashed into the pillars of the burned bridge at Krems , overturning their boats . Tossed into the icy river , most had drowned . Despite this initial panic , Gazan 's column retained its cohesion , and responded well to various difficult demands . Dupont had demonstrated his tactical acumen : when he heard cannon fire , he directed his troops toward it to support the French division . In terms of French staffing , Mortier 's failure to guard his flank , especially in the face of Napoleon 's direct advice , adversely influenced his relationship with his commander . However , in the immediate weeks ahead , the flamboyant Murat did more to annoy Napoleon than Mortier had . In assessing the battle and its aftermath , historians have laid the blame and credit for its outcome not only on Mortier and Gazan : " Napoleon , aware of Mortier 's danger and his own culpability for it , vented his frustration on Murat , whom he unjustly accused of abandoning Mortier for the empty glory of riding through Vienna . " 
- After the victory at Austerlitz , Napoleon dispersed the VIII Corps and reassigned Mortier . However disappointed he may have been with Mortier , Napoleon was pleased with Gazan 's performance . As recognition of his conduct in what the French called " the immortal Battle of Dürenstein " , Gazan received the Officer 's Grand Cross of the Legion of Honor . 
- The loss of Schmitt was a significant blow to the Austrian military organization . Called out of retirement for this specific task , he was one of their most experienced general staff officers , other than the Archduke Charles . From the summer of 1796 until his retirement in 1800 he had been Chief of the Quartermaster General Staff of the Army , the Lower Rhine , the Rhine and the Army of Germany . Furthermore , he was a trusted member of Archduke Charles ' staff . He had helped to design several of Charles ' more important victories at Emmendingen , Schliengen , the sieges at Kehl and Hünigen , the battles at Ostrach and Stockach , and the northern Swiss Campaign of 1799 that included battles at Winterthur and Zürich . An experienced officer and excellent tactician , he might well have made a more effective Chief of the Quartermaster General Staff of the Coalition Army at the Battle of Austerlitz than his eventual replacement , Franz von Weyrother . In Schmitt 's absence Weyrother , the architect of the Austrian catastrophe at Hohenlinden in 1800 , was chosen to develop the general battle plan of Coalition action at Austerlitz . Schmitt , undoubtedly a far better tactician than Weyrother , and possessed of superior training and mapping skills , would have developed a more realistic Coalition plan for Austerlitz . Schmitt 's presence would probably not have been enough to turn that defeat into a victory , but it would have mitigated the magnitude of the Coalition 's losses ; Austerlitz was considered one of Napoleon 's finest triumphs . 
- In the broader picture , despite the important major naval engagements , the outcome of the War of the Third Coalition was determined on the Continent , predominantly in the two major land operations . In the Ulm campaign , the Habsburgs achieved some minor victories , such as Klenau 's at Haslach @-@ Jungingen , but ultimately lost an entire army and an officer corps . The latter would not resume arms against France until formally exchanged . This condition crippled the Austrian military leadership and forced the recall of such pensioners as Schmitt out of retirement . After the capitulation at Ulm , isolated portions of the Austrian military evaded capture and joined with their Russian allies ; Michael von Kienmayer 's corps slipped out of the encirclement and joined Kutuzov 's force . A few other small forces refused to capitulate and seemingly melted into the Bavarian mountains and the Thuringian forests , to reappear in Bohemia for Austerlitz . Sixteen hundred cavalry , including Archduke Ferdinand and Prince Schwarzenberg , broke out of Ulm before its capitulation . Maximilian , Count of Merveldt , led his column back through the mountains into Austria , fighting rear guard actions against pursuing French forces at the Steyer ( Steyr ) and Mariazell . These elusive units were insufficient to balance heavy losses at key battles in which the Austrians could not hold their own against the French . Between the Ulm capitulation and the Austrian and Russian defeat at Austerlitz , there were other minor achievements : a successful skirmish between the cavalry that escaped from Ulm and the French near the town of Nördlingen , the contested victory at Dürenstein , and another within days at Schöngrabern . 
- The second determining event , the decisive French victory at the Battle of Austerlitz over the combined Russian and Austrian armies , forced the Austrian withdrawal from the Coalition . The subsequent Peace of Pressburg , signed on 26 December 1805 , reinforced the earlier treaties of Campo Formio and Lunéville . Furthermore , Austria ceded land to Napoleon 's German allies , and paid an indemnity of 40 million francs . Victory at Austerlitz also gave Napoleon the latitude to create a buffer zone of German states between France and the states of Prussia , Russia , and Austria . These measures did not establish a lasting peace on the continent . Prussian worries about growing French influence in Central Europe sparked the War of the Fourth Coalition in 1806 , in which Austria did not participate . 
- 
- = = Battlefield commemorations = = 
- 
- Until 1805 , Dürenstein was probably best known as the village in which crusader Richard the Lionheart was held by Leopold V , Duke of Austria . In 1741 , during the War of the Austrian Succession , several hundred local villagers had held off the French and Bavarian armies , intent on capturing Vienna , by painting drain pipes to look like cannons , and beating on drums , thus suggesting the presence of a large force . 
- After 1805 , the exploits of 40 @,@ 000 French , Russian , and Austrian soldiers excited the European imagination . General Schmitt 's grave has never been found , but in 1811 a monument for him was erected at the Stein Tor , the gate leading from the old village of Krems to the hamlet of Stein . The house in which Captain von Stiebar lived was marked with a bronze plate commemorating his contribution to the battle . In 1840 , a Spanish lithographer created an image of the battle , which was later expanded by English lithographer John Outhwaite . The image depicts the evacuation of French troops via the Danube flotilla ( see Infobox image ) on a moonlit night . In fact , the moon was in its last quarter phase 48 hours later , and on 11 November probably did not provide as much light as depicted in the image . 
- In 1836 , Jean Antoine Siméon Fort ( French , 1793 – 1861 ) , a historical painter , created a watercolor of the battle , Combat de Dürnstein le 11 novembre 1805 ( ( English ) Battle of Dürenstein of 11 November 1805 ) , which is in the Trianon collection at Versailles . 
- In the Russian novel War and Peace , Leo Tolstoy devoted several pages to the battle , its prelude , and its aftermath , and the delivery of its news to the Tsar by Prince Andrew . Between Dürenstein and Rossatz , at the edge of the Loiben plain , stands the " Little Frenchman " memorial ( see image ) erected in 1905 to commemorate the battle ; it bears the names of Mortier , Gazan , Kutuzov , Schmitt , and others on a copper @-@ engraved plate . 
- 
- = = Orders of battle = = 
- 
- 
- = = = French VIII . Corps ( Corps Mortier ) = = = 
- 
- On 6 November , Édouard Adolphe Mortier commanded the following forces : 
- 1st Division under command of Pierre Dupont de l 'Étang ( formerly 1st Division of VI . Corps ) , six battalions , three squadrons , and three guns , most of which were involved in the fighting after mid @-@ day . 
- 2nd Division under command of Honoré Théodore Maxime Gazan de la Peyrière ( formerly 2nd Division of the V. Corps ) , nine battalions , three squadrons , three guns . 
- 3rd Division under command of Jean @-@ Baptiste Dumonceau ( Batavian Division , formerly 3rd Division of the II . Corps ) . The 3rd Division was not involved in the fighting . 
- Dragoon Division under command of Louis Klein . Klein 's division included the 1st , 2nd , 4th , and 14th Regiments of Dragoons . They were not involved in the fighting . 
- Danube fleet of fifty boats , under the command of Frigate Captain Lostange . 
- Total : fifteen battalions , six squadrons , six guns , approximately 12 @,@ 000 men , not all of which were involved in the fighting . 
- 
- = = = Coalition columns = = = 
- 
- First Column , commanded by General of Brigade Prince Pyotr Ivanovich Bagration , included three battalions of infantry , three grenadier battalions , and three Jäger battalions , ten squadrons of Hussars . 
- Second Column , Lieutenant General Essen , included six battalions of infantry , three battalions of grenadiers , and five squadrons of Hussars . 
- Third Column , commanded by Lieutenant General Dokhturov , including six battalions of infantry , one battalion from the 8th Jäger regiment , and ten squadrons of the Hussar Regiment Mariupol . 
- Fourth Column , commanded by Lieutenant General Schepelev , nine battalions of infantry . 
- Fifth Column , Lieutenant General Freiherr von Maltitz , nine battalions of infantry . 
- Sixth Column , Lieutenant General Freiherr von Rosen , with six battalions of Infantry and ten squadrons of cavalry . The Sixth Column did not take part in the fighting . 
- Austrian Infantry Brigade , Major General Johann Nepomuk von Nostitz @-@ Rieneck , four battalions of Border Infantry , including the highly decorated 9th Regiment Peterwardeiner . 
- Austrian Cavalry Division , Lieutenant Field Marshal Friedrich Karl Wilhelm , Fürst zu Hohenlohe , twenty @-@ two squadrons of cavalry . 
- Total : fifty @-@ eight battalions , sixty @-@ two squadrons , fourteen artillery batteries , approximately 24 @,@ 000 men and 168 guns . 
- 
- 
- = Brock Lesnar = 
- 
- Brock Edward Lesnar / ˈlɛznər / ( born July 12 , 1977 ) is an American Canadian professional wrestler , mixed martial artist , and former amateur wrestler and professional American football player . He is currently signed to WWE on the Raw brand . He is a four @-@ time WWE ( World Heavyweight ) Champion , a former UFC Heavyweight Champion , and an NCAA Division I Heavyweight Wrestling Champion . He is also a one @-@ time IWGP Heavyweight Champion , making him a five @-@ time world champion in professional wrestling . As of July 14 , 2016 , he is # 8 in official UFC heavyweight rankings . 
- After his successful amateur wrestling career at Bismarck State College and the University of Minnesota ( 106 wins and 5 losses ) , Lesnar signed with WWE ( then the World Wrestling Federation ) in 2000 . He was assigned to its developmental promotion Ohio Valley Wrestling ( OVW ) , where he was a three @-@ time OVW Southern Tag Team Champion with Shelton Benjamin . After debuting on WWE 's main roster in 2002 , he won the WWE Championship on three separate occasions with victories over The Rock and Kurt Angle ( twice ) . Lesnar won his first WWE Undisputed Championship five months after his main roster debut at the age of 25 , becoming the youngest champion in the title 's history . He was also the 2002 King of the Ring and the 2003 Royal Rumble winner , making him the youngest King of the Ring and Royal Rumble winner as well . Following his match with Goldberg at WrestleMania XX , Lesnar left the WWE and pursued a career in the National Football League ( NFL ) . He was named a defensive tackle for the Minnesota Vikings but was cut prior to the start of the 2004 – 05 season . In 2005 , Lesnar returned to professional wrestling and signed with New Japan Pro Wrestling ( NJPW ) , where he won the IWGP Heavyweight Championship in his first match . After a contractual dispute with NJPW , he also wrestled as IWGP Heavyweight Champion in the Inoki Genome Federation ( IGF ) . 
- In 2006 , Lesnar pursued a career in mixed martial arts . He signed with Hero 's and won his first fight , against Min @-@ Soo Kim , in June 2007 . He then signed with the Ultimate Fighting Championship ( UFC ) the following October . Lesnar lost in his UFC debut against Frank Mir and then won his second fight against Heath Herring . In November 2008 , Lesnar defeated Randy Couture to become the UFC Heavyweight Champion . Shortly after a successful title defense in a rematch with Mir , Lesnar was sidelined due to diverticulitis . He would return at UFC 116 to defeat Interim UFC Heavyweight Champion Shane Carwin and unify the heavyweight championships , becoming the Undisputed Heavyweight Champion . Lesnar then lost the championship to Cain Velasquez at UFC 121 . In 2011 , he was once again sidelined due to diverticulitis and underwent surgery . Lesnar returned at UFC 141 in December , losing to Alistair Overeem and promptly retiring from MMA . Lesnar was a box office sensation in UFC . He took part in a few of the best selling pay @-@ per @-@ views in UFC history , including UFC 100 and UFC 200 . 
- In April 2012 , Lesnar once again returned to professional wrestling , rejoining WWE after an eight @-@ year hiatus . Two years later , at WrestleMania XXX , Lesnar defeated The Undertaker to end his undefeated streak at the premier annual event . Lesnar has been managed by Paul Heyman throughout the majority of his professional wrestling career . He has headlined numerous pay @-@ per @-@ view events for both the WWE and UFC , including WrestleMania XIX , WrestleMania 31 , UFC 100 , and UFC 116 . In 2015 , an ESPN.com article referred to Lesnar as " the most accomplished athlete in pro wrestling history " . 
- 
- = = Early life = = 
- 
- Lesnar was born in Webster , South Dakota , on July 12 , 1977 . He was raised on a Webster dairy farm owned by his parents , Stephanie and Richard Lesnar . He is of German descent . He has two older brothers named Troy and Chad , and a younger sister named Brandi . At age 17 , he joined the National Guard , where he was assigned to an office job after his red @-@ green colorblindness was deemed hazardous to his desire to work with explosives . He lost this job after failing a computer typing test , and later worked for a construction company . 
- Lesnar attended Webster High School , where he played football and competed in amateur wrestling , placing third in the state championships his senior year . He then attended Bismarck State College , where he won the National Junior College Athletic Association ( NJCAA ) heavyweight wrestling championship in his sophomore year . He transferred to the University of Minnesota on a wrestling scholarship for his junior and senior college years . There , he was roommates with future WWE colleague Shelton Benjamin , who was also his assistant coach . 
- Lesnar won the 2000 National Collegiate Athletic Association ( NCAA ) Division I heavyweight wrestling championship his senior year after being the runner @-@ up to Stephen Neal the year prior . He finished his amateur career as a two @-@ time NJCAA All @-@ American , the 1998 NJCAA Heavyweight Champion , two @-@ time NCAA All @-@ American , two @-@ time Big Ten Conference Champion , and the 2000 NCAA Heavyweight Champion , with a record of 106 – 5 overall in four years of college . 
- 
- = = Professional wrestling career = = 
- 
- 
- = = = World Wrestling Federation / Entertainment = = = 
- 
- 
- = = = = Training and debut ( 2000 – 2002 ) = = = = 
- 
- In 2000 , Lesnar signed with the World Wrestling Federation ( WWF ) . He was sent to its developmental territory , Ohio Valley Wrestling . There , Lesnar first met future friend and manager Paul Heyman . He formed a tag team known as " The Minnesota Stretching Crew " with his former college roommate , Shelton Benjamin . Lesnar and Benjamin won the OVW Southern Tag Team Championship on three separate occasions . Lesnar wrestled several dark matches in 2001 and 2002 before being called up to the main roster . 
- Lesnar debuted on WWF television on the March 18 , 2002 , episode of Raw , coming through the crowd and attacking Al Snow , Maven and Spike Dudley during their match . He was accompanied by Paul Heyman , who was seen giving instructions to Lesnar . When the brand extension was introduced in the WWF , Lesnar was drafted to the Raw brand . Later , Heyman was confirmed to be Lesnar 's agent and gave Lesnar the nickname " The Next Big Thing " . Lesnar 's first feud was with the Hardy Boyz . Lesnar and Jeff Hardy squared off at Backlash in Lesnar 's first official televised match . He won the match by knockout after Hardy was unable to respond to referee Theodore Long . The next night on Raw , Lesnar faced off against Jeff Hardy 's brother , Matt Hardy , and defeated him in the same fashion . 
- 
- = = = = WWE Championship reigns ( 2002 – 2004 ) = = = = 
- 
- In June 2002 , Lesnar won the King of the Ring tournament , defeating Bubba Ray Dudley in the first round , Booker T in the quarter @-@ finals , Test in the semi @-@ finals , and Rob Van Dam in the finals , earning him a shot at the WWE Undisputed Championship at SummerSlam . On July 22 , Lesnar joined the SmackDown ! brand . After a quick feud with Hollywood Hulk Hogan in August 2002 , Lesnar defeated WWE Undisputed Champion , The Rock at SummerSlam to become the WWE Undisputed Champion and youngest WWE Champion at age 25 , a record previously held by The Rock . He also became the second fastest wrestler to win the WWE Championship since his debut ( 126 days ) behind only Ric Flair ( 113 days ) . At the time , the Undisputed WWE Championship was being defended on both brands , so Raw General Manager Eric Bischoff expected Lesnar to return to Raw the following night . However , SmackDown ! General Manager Stephanie McMahon announced that Lesnar was only required to defend the title on SmackDown ! , forcing Bischoff to institute a new championship for Raw ( the World Heavyweight Championship ) . The WWE Undisputed Championship was then renamed the WWE Championship . 
- Lesnar 's rapid rise to the top of WWE in 2002 led to a feud with The Undertaker , which involved a match at Unforgiven . The match ended in a double disqualification resulting in Lesnar retaining the title . Lesnar faced The Undertaker again , at No Mercy , this time in a Hell in a Cell match . Leading up to the match , in the storyline , Lesnar broke the Undertaker 's hand with a propane tank . Despite Heyman begging McMahon not to let The Undertaker use his cast as a weapon , the request was denied and the match went on as planned . In a match that saw both wrestlers and even Heyman covered in blood , it ended when Lesnar reversed an attempted Tombstone Piledriver into his finishing F @-@ 5 maneuver for the win . Six days after his Hell in a Cell match with The Undertaker , Lesnar successfully retained his WWE title in a handicap match with Heyman at the Rebellion pay @-@ per @-@ view against Edge . 
- Lesnar 's next opponent was Big Show . Heyman was convinced more than anyone that Lesnar could not win , trying to talk him out of defending the title . Lesnar refused and faced Big Show in Madison Square Garden at the Survivor Series pay @-@ per @-@ view . Towards the end of the match , Lesnar delivered an F @-@ 5 to Big Show , but when he went for the pin , Heyman pulled the referee out of the ring . This allowed Big Show to capitalize , and he proceeded to chokeslam Lesnar on a steel chair . Show went on to pin Lesnar and win the title . This loss was Lesnar 's first pinfall loss in WWE and led Lesnar to turn into a fan favorite for the first time in his career . Following Survivor Series , Heyman made it clear that Lesnar would not get a rematch , and had snuck a special clause saying so into his contract . In order to gain his revenge on Big Show and Heyman , Lesnar interfered in his first title defense , which came against Kurt Angle the next month at Armageddon . Lesnar hit the F @-@ 5 on the champion , which enabled Angle to pin him and win the title . On the following episode of SmackDown ! , however , Angle introduced Heyman as his manager and , despite promising Lesnar a title shot earlier in the evening , declared that Lesnar still would not get it . Lesnar was beaten down by Big Show and Angle after the main event , but would get his revenge after the show went off the air . He eventually knocked the Big Show out with a steel chair , leaving Angle alone with Lesnar . Lesnar then chased the champion out of the ring and resumed his assault that culminated when Lesnar used the F @-@ 5 to propel Angle 's right knee into the steel ringpost . As paramedics tended to a screaming Angle , Lesnar finished off the assault with a kneebreaker on the ringside barricade , breaking the champion 's leg in storyline . 
- With Angle temporarily put out of action , Lesnar 's rivalry with Heyman and the Big Show resumed , which culminated in a match at the Royal Rumble in January 2003 with the winner being placed into the Royal Rumble later in the evening . Lesnar would defeat Big Show and entered the Royal Rumble as the # 29 entry , the second to last competitor to enter the match . He eliminated Matt Hardy and The World 's Greatest Tag Team , which was composed of Charlie Haas and his former OVW teammate , Shelton Benjamin , who were mentored by Angle . Lesnar would then eliminate The Undertaker last and win the Royal Rumble , which guaranteed him a title match at WrestleMania XIX . 
- After the Royal Rumble , Lesnar and Chris Benoit faced off against Angle , Haas , and Benjamin at No Way Out the following month and Lesnar 's team won the match . During the match at WrestleMania , Lesnar botched a shooting star press , a move he 'd done numerous times in developmental matches , and jammed his head and neck . This stunned Lesnar and forced Angle and Lesnar to improvise the finish of the match . Lesnar would defeat Angle , after delivering an F @-@ 5 , to win his second WWE Championship . Lesnar was diagnosed with a legitimate concussion from the botched shooting star press . 
- After WrestleMania , Lesnar turned his attention to John Cena , who had returned from injury in February 2003 and who had been F @-@ 5'd into a ringpost in the same manner Angle had been . Cena claimed Lesnar nearly ended his career and even named his new finishing move the " F.U. " as a jab at the new champion . The feud ended in a match at Backlash when Lesnar defeated Cena . On the following episode of SmackDown ! , Lesnar returned to his rivalry with Big Show after he injured Rey Mysterio badly during their match at Backlash . Show 's attack resulted in Mysterio being carried out on a stretcher and back board and Big Show took Mysterio off the stretcher and swung the back board into the ringpost , compounding the injury . Lesnar called out the Big Show , who demanded that Lesnar put his title on the line against him . This led to a stretcher match at Judgment Day for the title . Lesnar successfully retained his title with help from Rey Mysterio and a forklift . During the scripted rivalry , on SmackDown , Lesnar lifted Big Show off the top @-@ rope in a superplex which caused the ring to collapse on impact . 
- As Lesnar and Big Show continued their rivalry , Kurt Angle returned from his neck surgery and he and Lesnar began to form a more friendly rivalry , as the two were allies yet contenders for the title . At the first ever SmackDown brand @-@ exclusive pay @-@ per @-@ view in July , Vengeance , Lesnar took on Angle and Big Show in a No Disqualification triple threat match for his title , which ended after Angle hit the Angle Slam on both Big Show and Lesnar , pinning the champion to become WWE Champion for a fourth time . 
- Lesnar continued to aggressively pursue the WWE title despite his friendship with Angle . Mr. McMahon found his way into the angle , at first berating Lesnar , who had involved himself in McMahon 's rivalry with Zach Gowen , for losing to Angle . This all turned out to be a swerve that came into focus on the August 7 , 2003 SmackDown ! in Kelowna , British Columbia . That night , Lesnar and McMahon were to face each other in a steel cage match with Angle as the special guest referee as per McMahon 's orders on the previous week 's program . During the match , Lesnar had passed out due to a staged backstage incident and McMahon was set to pin him , but Angle refused to allow McMahon to win that way . As the two men began to argue , Lesnar rose to his feet , revealing the ruse to the crowd , and F @-@ 5'd Angle . He then brutally beat Angle while McMahon watched , and celebrated with him afterwards , turning heel in the process . At SummerSlam , Lesnar lost to Angle when Angle made Lesnar tap out to the ankle lock . After that , Lesnar would cement his heel turn by brutalizing smaller wrestlers and attacking his rivals on a more consistent basis . He returned to using the F @-@ 5 to propel his opponents ' legs into the ringpost , as he did to Spanky and Gowen , and interfered in Angle 's matches on two separate occasions . On the September 18 , 2003 episode of SmackDown ! , Lesnar received his third shot at Angle in as many months when he faced the champion in a sixty @-@ minute Iron Man match for the title . Lesnar won the match and his third WWE Championship by a final count of five to four . 
- Lesnar successfully defended his newly won title against the debuting Paul London on the October 9 edition of SmackDown ! . He returned to feuding with The Undertaker . Lesnar had previously cost Undertaker the title in a match against then @-@ champion Kurt Angle , which granted him a shot at Lesnar 's title . At No Mercy , Lesnar defeated Undertaker in a Biker Chain match . The rivalry then came to an end after The Undertaker chose to focus on Mr. McMahon . 
- After Paul Heyman returned to WWE as general manager of SmackDown ! , Lesnar aligned himself with his former manager . With Survivor Series coming up , Lesnar decided to challenge Angle to a traditional Survivor Series elimination tag team match . Lesnar chose Big Show as his first teammate , with Heyman adding a returning Nathan Jones and a debuting Matt Morgan to bring the team number to four . Angle chose Chris Benoit and The APA to join his team . However , Faarooq was injured during a match with Lesnar and Angle 's team was forced to find a replacement for him . Lesnar 's team picked A @-@ Train to fill the fifth and final spot for them after he attacked John Cena , who refused to accept an invitation to join Lesnar 's team . Cena instead joined Angle 's team , and Angle added Hardcore Holly as the fifth member ; Lesnar had injured Holly the year before and he hadn 't wrestled since . In the climax of the match , Chris Benoit became only the second wrestler to make Lesnar tap out . Lesnar faced Benoit in a singles bout two weeks later for the WWE Championship on SmackDown ! , where Lesnar won after Benoit passed out to Lesnar 's debuting Brock Lock submission hold . 
- The Survivor Series event marked the first time Lesnar met Goldberg from the Raw brand . After Lesnar claimed in a backstage interview that he could beat anybody in the world , Goldberg interrupted the interview and introduced himself to Lesnar , shaking hands with him before leaving with a staredown . Lesnar followed this rivalry with a feud involving Hardcore Holly . In the storyline , Holly wanted revenge on Lesnar for legitimately injuring his neck during a previous match between the two in 2002 which left Holly in need of neck surgery and out of action for a year . At the Royal Rumble in 2004 , Lesnar defeated Holly to retain the WWE Championship . Later in the Royal Rumble match , Lesnar attacked Goldberg and delivered an F @-@ 5 , enabling Kurt Angle to eliminate him . 
- 
- = = = = Final storylines and departure ( 2004 ) = = = = 
- 
- In February , Lesnar faced Eddie Guerrero for the WWE title at No Way Out . Late in the match , Goldberg delivered a spear to Lesnar while the ref was unconscious . Afterwards , Guerrero went to pin Lesnar but Lesnar kicked out at two . Lesnar then attempted to F @-@ 5 Guerrero but Guerrero reversed it into a DDT . Guerrero then hit a frog splash , pinning Lesnar to win the WWE Championship . An angry Lesnar then began feuding with Goldberg , blaming him for losing his title , and a match was set up between the two at WrestleMania XX . During the feud with Goldberg , Lesnar was also at odds with Stone Cold Steve Austin , who was shown suggesting to Goldberg that he attack Lesnar at No Way Out . After Lesnar attacked Austin on Raw and stole his four @-@ wheeler , Austin was inserted as the special guest referee for the WrestleMania match . On the March 4 episode of SmackDown , Lesnar defeated Hardcore Holly in his last match on a weekly WWE televised show . Behind the scenes , it was widely known that the match was Goldberg 's last in WWE . Only a week before WrestleMania , rumors surfaced that Lesnar , too , was leaving to pursue a career in the National Football League ( NFL ) . As a result , Lesnar 's match with Goldberg became a fiasco as the fans at Madison Square Garden jeered and heckled both wrestlers vociferously . Goldberg gained victory after delivering a Jackhammer to Lesnar and both men subsequently received Stone Cold Stunners from Austin . 
- 
- = = = New Japan Pro Wrestling ( 2005 – 2007 ) = = = 
- 
- On October 8 , 2005 , Lesnar won the IWGP Heavyweight Championship on his debut match in a three @-@ way match with Kazuyuki Fujita and Masahiro Chono at a New Japan Pro Wrestling ( NJPW ) show in the Tokyo Dome . Lesnar is one of the few American wrestlers to have held this title . He won the match by pinning Masahiro Chono after an F @-@ 5 , which he had renamed the Verdict since WWE owns the trademark on the F @-@ 5 name . After the match , Lesnar stated that this name was referring to his lawsuit against WWE . 
- On December 6 , WWE filed a motion for a temporary restraining order to prevent Lesnar from continuing to work with NJPW , but the court did not grant it . Following that , he had two non @-@ title victories against Manabu Nakanishi and Yuji Nagata . Lesnar successfully defended his championship on January 4 , 2006 , against former champion Shinsuke Nakamura . On January 13 , WWE once again filed an injunction against Lesnar to stop him from defending the IWGP Heavyweight Championship which was also not enforced as he went on to retain his championship against former Sumo Wrestling Grand Champion Akebono on March 19 , at the Sumo Hall . Lesnar had another successful title defense against Giant Bernard on May 3 , 2006 . This was the first American vs. American title match in NJPW since Vader vs. Stan Hansen in 1990 . 
- On July 15 , 2006 , New Japan Pro Wrestling announced Lesnar would not return to defend the IWGP Heavyweight Championship due to " visa issues " and had been stripped of the title . A tournament was held on July 16 to determine the new champion which was won by Hiroshi Tanahashi . Lesnar continued to possess the physical IWGP Championship belt until late June 2007 . 
- Approximately one year later on June 29 , 2007 , Lesnar defended his IWGP Heavyweight Championship against TNA World Heavyweight Champion Kurt Angle in a champion versus champion match . Inoki Genome Federation promoter Antonio Inoki had stated Lesnar was the " proper " IWGP Heavyweight Champion as he was not defeated for the title . Angle would defeat Lesnar by forcing him to tap out to the Angle lock to win the IWGP Heavyweight Championship as recognized by IGF and Total Nonstop Action Wrestling ( TNA ) . This was Lesnar 's last match as a professional wrestler until 2012 , when he re @-@ signed with WWE . 
- 
- = = = = Lawsuit = = = = 
- 
- Lesnar had previously signed a non @-@ compete clause in order to be released from his contract with WWE , which prohibited him from working for any other professional wrestling companies before June 2010 . However , he decided to challenge this ruling in court . WWE responded with a counterclaim after Lesnar breached the agreement by appearing at a New Japan Pro Wrestling show in 2004 . In July 2005 , the two sides dropped their claims and entered negotiations to renew their relationship . WWE had offered Lesnar a contract , but on August 2 , 2005 , WWE 's official website reported that Lesnar had withdrawn from any involvement with the company . The lawsuit began to enter settlement talks on September 21 , but did not get solved . 
- On January 14 , 2006 , Judge Christopher Droney stated that unless WWE gave him a good argument between then and the 25th , he would rule in favor of Lesnar , giving him a summary judgment . This would have enabled Lesnar to work anywhere , immediately . WWE was later granted a deadline postponement . On April 24 , WWE announced on WWE.com that both parties had reached a settlement . On June 12 , a federal judge dismissed the case at the request of both legal parties . 
- 
- = = = Return to WWE = = = 
- 
- 
- = = = = Feud with Triple H and ending the Streak ( 2012 – 2014 ) = = = = 
- 
- Lesnar returned to the WWE on April 2 , 2012 , on Raw , as a heel by confronting and delivering an F @-@ 5 to John Cena . The following week on Raw , general manager John Laurinaitis revealed that he signed Lesnar to bring " legitimacy " back to the WWE and become the " new face of the WWE " . Laurinaitis also announced that Lesnar would face Cena at Extreme Rules with the Extreme Rules stipulation later added to the match . Lesnar was dominant throughout the match until Cena punched Lesnar in the face with a steel chain wrapped around his fist . Cena then delivered the Attitude Adjustment to Lesnar onto steel steps and Lesnar lost the match . 
- The following night on Raw , WWE 's Chief Operating Officer Triple H refused to give in to Lesnar 's unreasonable contract demands ( which included being given his own personal jet and having Raw renamed to Monday Night Raw Starring Brock Lesnar ) , resulting in Lesnar attacking him and breaking his arm with a Kimura lock in storyline . The next week on Raw , Paul Heyman made his return as Lesnar 's legal representative and claimed that Lesnar was quitting WWE . He later announced a lawsuit against WWE for breach of contract . At No Way Out in June , Triple H challenged Lesnar , who was not present , to a match at SummerSlam which Lesnar refused . Stephanie McMahon would later goad Heyman into accepting the match on Lesnar 's behalf on July 23 at Raw 1000 . On August 19 , at SummerSlam , Lesnar defeated Triple H in a No Disqualification match via submission after once again breaking his arm in storyline . The following night on Raw , Lesnar declared himself the new " King of Kings " and said that he would depart from WWE after his victory over Triple H , stating that he had conquered everything in the company . 
- Lesnar returned on the January 28 , 2013 episode of Raw , confronting Vince McMahon who was about to fire Heyman , and despite Heyman 's pleas , Lesnar hit McMahon with an F @-@ 5 , breaking McMahon 's pelvis in storyline . The following week , during The Miz 's MizTV talk show , Raw Managing Supervisor Vickie Guerrero revealed herself as the one who signed Lesnar to a new contract to impress McMahon . On the February 25 episode of Raw , Lesnar once again attempted to attack McMahon , only to get into a brawl with the returning Triple H , which resulted in Lesnar legitimately having his head split open and requiring eighteen stitches . The following week , Triple H issued a challenge to Lesnar , requesting a rematch with him at WrestleMania 29 , which Lesnar accepted but only after Triple H signed a contract and Lesnar named the stipulation . The following week , after Triple H signed the contract and assaulted Heyman , the stipulation was revealed as No Holds Barred with Triple H 's career on the line . Lesnar ended up losing the match after Triple H hit him with a Pedigree onto the steel steps . On the April 15 episode of Raw , Lesnar attacked 3MB ( Heath Slater , Drew McIntyre , and Jinder Mahal ) before Heyman challenged Triple H to face Lesnar in a steel cage match at Extreme Rules , which Triple H accepted the following week . On May 19 at the pay @-@ per @-@ view , after interference from Heyman , Lesnar defeated Triple H and ended their feud . Lesnar returned on the June 17 episode of Raw , attacking Heyman 's fellow client CM Punk with an F @-@ 5 . Despite the accusations from Punk , Heyman claimed that he was not behind Lesnar 's attack on him . However , in July , Heyman turned on Punk , and claimed that Punk could not beat Lesnar , which led to Lesnar making his return and attacking Punk on the July 15 episode of Raw . The following week on Raw , Punk challenged Lesnar to a match at SummerSlam , where Lesnar defeated Punk in a no disqualification match . 
- On the December 30 episode of Raw , Lesnar returned with Heyman to announce his intentions to challenge the winner of the upcoming WWE World Heavyweight Championship match between Randy Orton and John Cena at the Royal Rumble . Lesnar then dared any wrestler who disapproved of that notion to challenge him , which was answered by Mark Henry , and a brawl would ensue , ending with Lesnar delivering an F @-@ 5 to Henry . The following week on Raw , Henry challenged Lesnar again , only to have Lesnar dislocate his elbow with the Kimura lock in storyline , and this led Big Show to come out afterwards to confront Lesnar , thus starting a feud which was settled at Royal Rumble , where Lesnar defeated the Big Show after attacking him with a steel chair before the match began . On the February 24 , 2014 episode of Raw , after Heyman stated that Lesnar had requested a match for the WWE World Heavyweight Championship at WrestleMania XXX , instead receiving an open contract to face anyone else of his choosing , The Undertaker then returned and chokeslammed Lesnar through a table , setting up their match at WrestleMania . Lesnar defeated Undertaker after executing three F @-@ 5s , ending his undefeated WrestleMania streak at 21 , a feat that was described by Sports Illustrated as being " the most shocking result since the Montreal Screwjob " . 
- 
- = = = = WWE World Heavyweight Champion ( 2014 – 2015 ) = = = = 
- 
- On the July 21 , 2014 episode of Raw , Triple H announced that Lesnar would face John Cena at SummerSlam for the WWE World Heavyweight Championship . At SummerSlam , Lesnar defeated Cena to become the WWE World Heavyweight Champion , and during the match he delivered sixteen suplexes ( most of which were German suplexes ) and two F @-@ 5s to Cena , who barely managed any offense . On the August 19 episode of Main Event , Triple H announced that Cena was invoking his championship rematch clause against Lesnar at Night of Champions , where Lesnar was disqualified due to Seth Rollins interfering , but retained his championship , which could not be lost via disqualification . Later in the year , after Rollins reunited with The Authority , he was added to Lesnar and Cena 's championship match at Royal Rumble , making it a triple threat match , which Lesnar won despite suffering a storyline broken rib during the match . 
- Lesnar 's next challenger was Roman Reigns , who won the Royal Rumble match to earn the right to face Lesnar for the title at WrestleMania 31 . During his main @-@ event match against Reigns , Lesnar delivered multiple suplexes and was heard exclaiming , " Suplex City , bitch ! " , and thereafter " Suplex City " became one of his signature catchphrases and merchandise motifs . After Lesnar and Reigns traded a few false finishes , Rollins cashed in his Money in the Bank contract while the match was in progress , making it a triple threat ; Rollins then pinned Reigns to win the title . The following night on Raw , Lesnar tried to invoke his rematch clause and subsequently attacked commentators Booker T , John " Bradshaw " Layfield , and Michael Cole , as well as a cameraman after Rollins refused the rematch , which led to Stephanie McMahon suspending Lesnar indefinitely in storyline . 
- Lesnar returned on the June 15 episode of Raw as a fan favorite , being chosen by The Authority as the number one contender to Rollins ' WWE World Heavyweight Championship at Battleground . On July 4 , Lesnar made his first non @-@ televised wrestling appearance for WWE since his 2012 return , defeating Kofi Kingston at The Beast in the East live event in Tokyo in a quick winning effort ; he also delivered F @-@ 5s to Kingston 's New Day stablemates Big E and Xavier Woods after the match . On July 19 at Battleground , Lesnar dominated Rollins , delivering 13 suplexes ; mid @-@ pinfall , after performing an F @-@ 5 , he was attacked by The Undertaker , who incapacitated Lesnar with a chokeslam and two Tombstone Piledrivers ; this ended the match , with Lesnar winning by disqualification and Rollins retaining the championship . 
- 
- = = = = Various feuds and storylines ( 2015 – present ) = = = = 
- 
- The following night on Raw , Undertaker explained that he had attacked Lesnar not for ending his WrestleMania streak , but rather for Lesnar allowing Heyman to constantly taunt Undertaker about it ; this led to the two brawling throughout the arena and a WrestleMania rematch being scheduled for SummerSlam on August 23 , where Undertaker would controversially defeat Lesnar ; during the match , Undertaker tapped out to a Kimura lock by Lesnar and the timekeeper rang the bell , but the referee did not see the tapout and demanded that the match continue , which saw Undertaker then hitting Lesnar with a low blow and Lesnar passed out to Undertaker 's submission hold , Hell 's Gate . The following night on Raw , Lesnar and Heyman challenged Undertaker to an immediate rematch , only to be confronted by Bo Dallas , who mocked Lesnar about his defeat ; Lesnar then responded by delivering 3 German suplexes and an F @-@ 5 to Dallas . 
- During Night of Champions , it was announced that Lesnar would face The Undertaker at the Hell in a Cell pay @-@ per @-@ view , where Lesnar defeated The Undertaker after a low blow and F @-@ 5 onto the exposed ring floor , ending their feud . The match was later voted " Match of the Year " during the 2015 Slammy Awards . On the January 11 episode of Raw , Lesnar returned , attacking The New Day , The League of Nations ( Sheamus , King Barrett , Rusev , and Alberto Del Rio ) , and Kevin Owens , before giving an F @-@ 5 to Roman Reigns . The following week on Raw , Lesnar would brawl with Reigns until they were both attacked by The Wyatt Family . At the Royal Rumble , Lesnar was the 23rd entrant , eliminating four competitors before being eliminated by Bray Wyatt with help from the rest of The Wyatt Family . 
- On the January 25 episode of Raw , Stephanie McMahon announced that the main event of Fastlane would be a triple threat match between Lesnar , Roman Reigns , and Dean Ambrose to determine who would face Triple H for the WWE World Heavyweight Championship at WrestleMania 32 . In the following weeks , Lesnar would be continuously provoked by Ambrose , with Reigns saving him from the subsequent attacks by Lesnar . At Fastlane , Lesnar dominated most of the match before he was put through two announce tables by Ambrose and Reigns ; he would ultimately lose the match after Reigns pinned Ambrose . Because of this , Lesnar attacked Ambrose in the parking lot as he was arriving at the arena for Raw , but Ambrose would return later in the night , having hijacked an ambulance , and he challenged Lesnar to a No Holds Barred Street Fight match at WrestleMania 32 , where Lesnar defeated Ambrose . On the July 7 edition of SmackDown , it was announced that Lesnar would be facing the returning Randy Orton at SummerSlam . On July 19 , at the 2016 WWE Draft , Lesnar was drafted to Raw . 
- 
- = = Football career = = 
- 
- After his match at WrestleMania XX , Lesnar sidelined his career in WWE to pursue a career in the National Football League ( NFL ) despite not playing American football since high school . The WWE issued this statement on their official website , WWE.com , following his departure : 
- Brock Lesnar has made a personal decision to put his WWE career on hold to prepare to tryout for the National Football League this season . Brock has wrestled his entire professional career in the WWE and we are proud of his accomplishments and wish him the best in his new endeavor . 
- Lesnar later told a Minnesota radio show that he had " three wonderful years " in WWE , but had grown unhappy and always wanted to play pro football , adding that he did not want to be 40 years old and wondering if he could have " made it " in football . In an interview about the NFL , he stated : 
- This is no load of bull ; it 's no WWE stunt . I am dead serious about this . I ain 't afraid of anything and I ain 't afraid of anybody . I 've been an underdog in athletics since I was five . I got zero college offers for wrestling . Now people say I can 't play football , that it 's a joke . I say I can . I 'm as good an athlete as a lot of guys in the NFL , if not better . I 've always had to fight for everything . I wasn 't the best technician in amateur wrestling but I was strong , had great conditioning , and a hard head . Nobody could break me . As long as I have that , I don 't give a damn what anybody else thinks . 
- Lesnar had a great showing at the NFL Combine . On April 17 , 2004 , a minivan collided with his motorbike ; he suffered a broken jaw and left hand , a bruised pelvis , and a pulled groin . Several NFL teams expressed interest in watching Lesnar work out . The Minnesota Vikings worked out Lesnar on June 11 , 2004 but he was hampered by the groin injury suffered in the April motorcycle accident . On July 24 it was reported that he was nearly recovered from his groin injury . He signed with the Vikings on July 27 and played in several preseason games for the team . He was released by the Vikings on August 30 , 2004 . Lesnar received an invitation to play as a representative for the Vikings in NFL Europa but declined due to his desire to stay in the United States with his family . He had several football cards produced of him during his time with the Vikings . 
- 
- = = Mixed martial arts career = = 
- 
- 
- = = = Hero 's ( 2007 ) = = = 
- 
- On April 29 , 2006 , after the final match of the K @-@ 1 World Grand Prix 2006 in Las Vegas , Lesnar announced his intent to join K @-@ 1 's mixed martial arts league , Hero 's . He trained with Minnesota Martial Arts Academy under Greg Nelson and Minnesota Assistant Head wrestling coach Marty Morgan . Lesnar announced on August 12 in Las Vegas that he had signed a deal with the K @-@ 1 promotion . His first fight was scheduled against Choi Hong @-@ man of Korea on June 2 , 2007 , at the K @-@ 1 Dynamite ! ! USA show . However , prior to the match , Hong @-@ Man was replaced by Min Soo Kim . Lesnar submitted Soo Kim with strikes in 1 : 09 of the first round to win his first official MMA match . 
- 
- = = = Ultimate Fighting Championship ( 2008 – 2011 ) = = = 
- 
- During UFC 77 , it was announced that Lesnar had reached a deal to fight with the Ultimate Fighting Championship ( UFC ) . On February 2 , 2008 , Lesnar made his debut with the promotion in an event titled UFC 81 : Breaking Point against former UFC Heavyweight Champion , Frank Mir . Due to his large hands , Lesnar was wearing 4XL gloves for the fight , making him the second man in Nevada 's combat sports history to wear such gloves after Choi Hong @-@ man . Lesnar secured an early takedown and began landing numerous punches but was docked a point after a punch hit Mir on the back of the head . Following another takedown by Lesnar , Mir managed to secure a kneebar and force a submission at 1 : 30 of the first round and Lesnar lost in his UFC debut . At UFC 82 , former UFC Heavyweight Champion and Hall of Famer Mark Coleman was announced to fight Lesnar at UFC 87 . However , Coleman withdrew from the fight due to an injury and was replaced by Heath Herring . Lesnar defeated Herring by unanimous decision . 
- Lesnar would then face Randy Couture for the UFC Heavyweight Championship at UFC 91 on November 15 . Lesnar would beat Couture via a technical knockout ( TKO ) in the second round to become the new UFC Heavyweight Champion . 
- On December 27 , 2008 , at UFC 92 , Frank Mir defeated Antônio Nogueira for the Interim Heavyweight Championship and was to face Lesnar for the Undisputed UFC Heavyweight Championship at UFC 98 . Immediately after winning the Interim Heavyweight title , Mir found Lesnar in the crowd and shouted , " You 've got my belt " . Due to a knee injury to Mir , the title unification match with Lesnar that was originally slated to be the UFC 98 main event was postponed . Lesnar won the postponed rematch with Mir at UFC 100 on July 11 , 2009 , via technical knockout in the second round . The win earned Lesnar Beatdown of the Year honors , with Anderson Silva , from Sherdog for 2009 . During his post @-@ match celebration , Lesnar flipped off the crowd who had been booing him . Lesnar also made a disparaging comment about the pay @-@ per @-@ view 's primary sponsor Bud Light , claiming they " won 't pay me nothin ' " and promoted Coors Light instead . Lesnar later apologized for his remarks at the post @-@ fight press conference , where he held a bottle of Bud Light and endorsed their product . 
- On July 1 , 2009 , it was reported that the winner of the Shane Carwin vs. Cain Velasquez fight at UFC 104 would face Lesnar but the match was scrapped and Lesnar was scheduled to defend his belt against Shane Carwin at UFC 106 on November 21 . On October 26 , 2009 , it was announced that Lesnar pulled out of his Carwin bout due to an illness . UFC President Dana White said Lesnar had been ill for three weeks , claiming he had never been this sick in his life and that it would take him a while to recover ; his fight with Carwin was rescheduled for UFC 108 on January 2 , 2010 . Lesnar initially sought treatment in Canada , but later told reporters that he had received " Third World treatment " at a hospital in Brandon , Manitoba , and that seeking better treatment in the U.S. saved his life . Lesnar went on to criticize Canadian health care further and stated that he shared his experience in an effort to speak " on the behalf of the doctors in the United States that don 't want health care reform to happen " . 
- On November 4 , it was confirmed that Lesnar was suffering from mononucleosis and that his bout with Carwin would have to wait a bit longer and the fight for Lesnar 's heavyweight championship was cancelled . On November 14 , at the UFC 105 post @-@ fight conference , Dana White stated , " [ Lesnar ] ' s not well and he 's not going to be getting well anytime soon " and that an interim title match might need to be set up . In addition to mononucleosis , it was revealed that he was suffering from a serious case of diverticulitis , an intestinal disorder , which required surgery . After further diagnosis , Lesnar underwent surgery on November 16 to close a perforation in his intestine that had been leaking fecal matter into his abdomen , causing pain , abscesses , and overtaxing his immune system to the point that he contracted mononucleosis . From the level of damage to Lesnar 's system , the surgeon estimated that the intestinal condition had been ongoing for around a year . 
- In January 2010 , Lesnar announced on ESPN 's SportsCenter that he was scheduled to make a return to the UFC in the summer . A match between Frank Mir and Shane Carwin took place on March 27 at UFC 111 to determine the Interim Heavyweight Champion , and Lesnar 's next opponent . Shane Carwin defeated Mir via knockout in the first round , becoming the new Interim Champion . After the fight , Lesnar came into the ring and stated , " It was a good fight but he 's wearing a belt that 's a make @-@ believe belt . I 've got the real championship belt " . Lesnar faced Carwin at UFC 116 to unify the heavyweight titles . Early in the first round , Carwin knocked Lesnar down with a punch , gave him a cut above his left eye , and used a ground and pound attack the rest of the round . In the next round , Lesnar was able to take Carwin down , attain a full mount , then move into side @-@ control and finish the fight with an arm triangle choke . With the victory , Lesnar became the Undisputed UFC Heavyweight Champion , earning his first Submission of the Night and giving Carwin his first loss . The win also tied a UFC record for most consecutive successful Heavyweight Championship defenses . 
- Lesnar 's next defense was against undefeated top contender Cain Velasquez on October 23 , at the Honda Center in Anaheim , California at UFC 121 . Dana White announced via SportsNation that the UFC would bring back UFC Primetime to hype the fight . He was defeated by Velasquez for the title by TKO in the first round . 
- On January 11 , 2011 , Lesnar was announced as a coach of The Ultimate Fighter Season 13 , opposite Junior dos Santos , with the two expected to fight on June 11 at UFC 131 ; however , he was struck with another bout of diverticulitis and had to withdraw from the fight on May 12 . He was replaced by Shane Carwin , who ended up losing against dos Santos . Lesnar underwent surgery on May 27 to help battle his problems with diverticulitis . Dana White said that he had a 12 @-@ inch piece of his colon removed . 
- In its May 2011 issue , ESPN 's magazine published a story listing the highest paid athlete based on base salary and earnings for the most recent calendar year or most recent season in 30 sports . Lesnar topped the list for MMA at $ 5 @.@ 3 million , which included his reported bout salaries and estimated pay @-@ per @-@ view bonuses . 
- In the summer of 2011 , Lesnar announced that he was returning to action , stating , " I feel like a new man . Healthy . Strong . I feel like I used to feel " . His return match was scheduled to be at UFC 141 on December 30 in Las Vegas against former Strikeforce heavyweight champion Alistair Overeem . Overeem won the fight by way of technical knockout in the first round . The result of the fight remains controversial , as Overeem tested positive for elevated levels of testosterone prior to his next fight . Lesnar then announced his retirement from MMA , mentioning his struggles with diverticulitis and saying " tonight was the last time you 'll see me in the octagon " . 
- Speculation about a return to MMA lasted until March 24 , 2015 , when Lesnar announced in an interview on SportsCenter that he had re @-@ signed with WWE and officially closed the door on a return to MMA , even though he was offered a deal " ten times more " than what he had made previously in his MMA career . He further elaborated that , while he was training for months for a return to the UFC , he felt " physically great but something was lacking mentally " . Lesnar added that " [ he 's ] an older caveman now , so [ he ] makes smarter caveman decisions " and that he chose to sign with WWE instead of returning to MMA because he could " work part @-@ time with full @-@ time pay " . 
- 
- = = = UFC 200 ( 2016 ) = = = 
- 
- Though Lesnar said he was " closing the door on MMA " in March 2015 , UFC announced on June 4 , 2016 , that he would return at UFC 200 on July 9 . WWE confirmed it had granted Lesnar " a one @-@ off opportunity " to compete at UFC 200 before he would return to the company for SummerSlam on August 21 . 
- Lesnar , representing Canada for the first time in his career , defeated Mark Hunt by unanimous decision ( 29 @-@ 27 ) , avoiding 19 of Hunt 's 30 attempted standing strikes , and taking him down four times to land 43 significant ground strikes , 32 in the final round . He also won a UFC @-@ record $ 2 @.@ 5 million purse . 
- On July 15 , Lesnar was notified of a potential anti @-@ doping policy violation by the United States Anti @-@ Doping Agency ( USADA ) stemming from an undisclosed banned substance in an out @-@ of @-@ competition sample collected on June 28 . Shortly after this was announced , WWE said Lesnar 's match with Randy Orton at SummerSlam would still take place . Hunt told UFC to either pay him half of Lesnar 's purse or let him out of his contract . He later changed his mind on Twitter , and asked for the whole purse . UFC has not yet publicly responded to Hunt . Lesnar told the Associated Press , " We will get to the bottom of this . " On July 19 , the UFC announced that a second sample taken in @-@ competition on July 9 tested positive for the same banned substance discovered in the previous out @-@ of @-@ competition sample . 
- 
- = = = UFC pay @-@ per @-@ views = = = 
- 
- 
- = = Personal life = = 
- 
- Lesnar married Rena Greek , better known as Sable , on May 6 , 2006 . They reside on a farm in Maryfield , Saskatchewan , having previously lived in Maple Plain , Minnesota . They have two sons : Turk ( born June 3 , 2009 ) and Duke ( born July 21 , 2010 ) . Lesnar also has twins with his former fiancée , Nicole McClain , the first being a son named Luke ( born Brock Jr . ) and the second a daughter named Mya Lynn ( born April 10 , 2002 ) . Mya is 10 minutes older than Luke . Lesnar has full custody of the twins , and is the stepfather of Mariah , Greek 's daughter with her late husband . 
- Lesnar is a conservative and a supporter of the Republican Party . He is a member of the National Rifle Association , and made an appearance at their annual meeting in May 2011 to discuss his passion for hunting and his role as a spokesman for Fusion Ammunition . 
- During his first run in WWE , Lesnar developed addictions to both alcohol and painkillers , allegedly drinking a bottle of vodka per day and taking hundreds of Vicodin pills per month to manage the pain caused by wear and tear on his body ; he named his accident at WrestleMania XIX as a particular source of pain . Lesnar claims that , as a result of his addiction and mental exhaustion , he does not remember " an entire two years " of his WWE career . 
- In January 2001 , Lesnar was arrested by police in Louisville , Kentucky for suspicion of possessing large amounts of anabolic steroids . The charges were dropped when it was discovered that the substances were a legal growth hormone . His lawyer described it as a " vitamin type of thing " . 
- On December 15 , 2011 , Lesnar was charged with hunting infractions on a trip to Alberta on November 19 , 2010 . Two charges were dropped , but Lesnar pleaded guilty to the charge of improper tagging of an animal . He was fined $ 1 @,@ 725 and given a six @-@ month hunting suspension . 
- As of July 2016 , Lesnar 's eldest son , Brock Jr. , is ranked # 1 in Saskatchewan and # 4 in all of Canada in amateur wrestling . 
- 
- = = Other media = = 
- 
- Lesnar appears in the video games WWE SmackDown ! Shut Your Mouth , WWE SmackDown ! Here Comes the Pain , Madden NFL 06 , UFC 2009 Undisputed , UFC Undisputed 2010 , WWE ' 12 , WWE ' 13 , WWE 2K14 , WWE 2K15 , WWE 2K16 , and WWE 2K17 . 
- In 2003 , WWE Home Video released a DVD chronicling Lesnar 's career entitled Brock Lesnar : Here Comes the Pain . It was re @-@ released in 2012 as a three @-@ disc DVD and two @-@ disc Blu @-@ ray collector 's edition to tie in with Lesnar 's WWE return . It was also expanded to include new matches and interviews . 
- Lesnar was featured on the covers of Flex and Muscle & Fitness magazine in 2004 , and Minneapolis ' City Pages in 2008 . 
- In 2009 , Lesnar signed an endorsement deal with Dymatize Nutrition . A CD containing footage of Lesnar training was included with Dymatize 's " Xpand " product . 
- In 2011 , Lesnar published an autobiography titled Death Clutch : My Story of Determination , Domination , and Survival ( ISBN 978 @-@ 0062023117 ) . It was co @-@ written with Paul Heyman . 
- In a 2013 post on his blog , Attack on Titan author Hajime Isayama revealed that he drew inspiration from Lesnar for the character of the Armored Titan . 
- Lesnar has also appeared in multiple comedic Instagram and Vine videos by actor Eric Stonestreet . 
- 
- = = Filmography = = 
- 
- 
- = = = Television = = = 
- 
- 
- = = = Video games = = = 
- 
- 
- = = In wrestling = = 
- 
- Finishing moves 
- Brock Lock ( Over @-@ the @-@ shoulder single leg Boston crab ) – 2002 – 2004 
- F @-@ 5 ( WWE ) / Verdict ( NJPW / IGF ) ( Fireman 's carry facebuster ) – 2002 – 2006 ; 2012 – present 
- Kimura lock – 2012 – present 
- Shooting star press – OVW ; only used once in WWE 
- Signature moves 
- Backbreaker 
- Fallaway slam 
- Gorilla press slam 
- Knee lifts to the opponent 's midsection 
- Multiple suplex variations 
- Belly @-@ to @-@ back , sometimes to two opponents at once 
- Fisherman 's , sometimes while delaying 
- Overhead belly @-@ to @-@ belly , sometimes into or out of the ring 
- Release / Rolling German 
- Snap 
- Vertical 
- Multiple turnbuckle thrusts 
- Powerslam 
- Rear naked choke 
- Running powerbomb 
- Standing double leg takedown followed by mounted punches 
- Triple non @-@ release powerbomb 
- Managers 
- Mr. McMahon 
- Sable 
- Paul Heyman 
- Nicknames 
- " The Anomaly " 
- " The Beast ( Incarnate ) " 
- " The Conqueror " 
- " The Freak " 
- " The Next Big Thing " 
- " The One in 21 – 1 / 22 @-@ 1 " 
- " The Nightmare of Suplex City " 
- Entrance themes 
- Ultimate Fighting Championship 
- " Enter Sandman " by Metallica 
- " Nickel Size Hail ( And the Damaging Winds ) " by Sunny Ledfurd 
- World Wrestling Entertainment / WWE 
- " Enforcer " by Jim Johnston ( April 8 , 2002 – June 3 , 2002 ) 
- " Next Big Thing " by Jim Johnston ( June 10 , 2002 – March 14 , 2004 ; April 2 , 2012 – August 20 , 2012 ) 
- " Next Big Thing ( Remix ) " by Jim Johnston ( January 28 , 2013 – present ) 
- 
- = = Mixed martial arts record = = 
- 
- 
- = = Championships , awards , and honors = = 
- 
- 
- = = = Collegiate wrestling = = = 
- 
- National Collegiate Athletic Association 
- NCAA Division I All @-@ American ( 1999 , 2000 ) 
- NCAA Division I Heavyweight Champion ( 2000 ) 
- Big Ten Conference Champion ( 1999 , 2000 ) 
- National Junior College Athletic Association 
- NJCAA All @-@ American ( 1997 , 1998 ) 
- NJCAA Heavyweight Champion ( 1998 ) 
- North Dakota State University Bison Tournament Champion ( 1997 – 1999 ) 
- 
- = = = Mixed martial arts = = = 
- 
- Inside Fights 
- Biggest Draw ( 2008 ) 
- Rookie of the Year ( 2008 ) 
- Sherdog Awards 
- Beatdown of the Year ( 2009 ) 
- Sports Illustrated 
- Top Newcomer of the Year ( 2008 ) 
- Ultimate Fighting Championship 
- UFC Heavyweight Championship ( 1 time ) 
- Submission of the Night ( 1 time ) 
- World MMA Awards 
- Breakthrough Fighter of the Year ( 2009 ) 
- Wrestling Observer Newsletter 
- Best Box Office Draw ( 2008 – 2010 ) 
- MMA Most Valuable Fighter ( 2008 – 2010 ) 
- 
- = = = Professional wrestling = = = 
- 
- Guinness World Records 
- World record : Youngest person to win the WWE Championship ( aged 25 years , 44 days ) 
- Inoki Genome Federation 
- IWGP Heavyweight Championship ( 1 time ) 1 
- New Japan Pro Wrestling 
- IWGP Heavyweight Championship ( 1 time ) 1 
- Ohio Valley Wrestling 
- OVW Southern Tag Team Championship ( 3 times ) – with Shelton Benjamin 
- Pro Wrestling Illustrated 
- Feud of the Year ( 2003 ) vs. Kurt Angle 
- Feud of the Year ( 2015 ) vs. The Undertaker 
- Match of the Year ( 2003 ) vs. Kurt Angle in an Iron Man match on SmackDown ! on September 16 
- Most Improved Wrestler of the Year ( 2002 ) 
- Wrestler of the Year ( 2002 , 2014 ) 
- Ranked # 1 of the top 500 singles wrestlers in the PWI 500 in 2003 
- Wrestling Observer Newsletter 
- Best Brawler ( 2003 ) 
- Best Wrestling Maneuver ( 2002 ) F @-@ 5 
- Feud of the Year ( 2003 ) vs. Kurt Angle 
- Most Improved Wrestler ( 2002 , 2003 ) 
- Wrestling Observer Newsletter Hall of Fame ( Class of 2015 ) 
- World Wrestling Entertainment / WWE 
- WWE Championship ( 4 times ) 2 
- King of the Ring ( 2002 ) 
- Royal Rumble ( 2003 ) 
- Slammy Awards ( 5 times ) 
- Hashtag of the Year ( 2015 ) – # SuplexCity 
- Match of the Year ( 2015 ) – vs The Undertaker at Hell in a Cell 
- Rivalry of the Year ( 2015 ) – vs The Undertaker 
- " Tell Me You Didn 't Just Say That " Moment of the Year ( 2015 ) – Coining " Suplex City " at WrestleMania 31 
- The OMG Shocking Moment of the Year ( 2014 ) – Ending The Undertaker 's WrestleMania streak at WrestleMania XXX 
- 1 ^ Lesnar 's IWGP Heavyweight Championship reign at IGF is considered a continuation of his reign from NJPW . 
- 2 ^ When Lesnar won the title for the first time it was known as the WWE Undisputed Championship . His next two reigns were as simply WWE Champion , while his final one was as WWE World Heavyweight Champion . 
- 
- 
- = Constant k filter = 
- 
- Constant k filters , also k @-@ type filters , are a type of electronic filter designed using the image method . They are the original and simplest filters produced by this methodology and consist of a ladder network of identical sections of passive components . Historically , they are the first filters that could approach the ideal filter frequency response to within any prescribed limit with the addition of a sufficient number of sections . However , they are rarely considered for a modern design , the principles behind them having been superseded by other methodologies which are more accurate in their prediction of filter response . 
- 
- = = History = = 
- 
- Constant k filters were invented by George Campbell . He published his work in 1922 , but had clearly invented the filters some time before , as his colleague at AT & T Co , Otto Zobel , was already making improvements to the design at this time . Campbell 's filters were far superior to the simpler single element circuits that had been used previously . Campbell called his filters electric wave filters , but this term later came to mean any filter that passes waves of some frequencies but not others . Many new forms of wave filter were subsequently invented ; an early ( and important ) variation was the m @-@ derived filter by Zobel who coined the term constant k for the Campbell filter in order to distinguish them . 
- The great advantage Campbell 's filters had over the RL circuit and other simple filters of the time was that they could be designed for any desired degree of stop band rejection or steepness of transition between pass band and stop band . It was only necessary to add more filter sections until the desired response was obtained . 
- The filters were designed by Campbell for the purpose of separating multiplexed telephone channels on transmission lines , but their subsequent use has been much more widespread than that . The design techniques used by Campbell have largely been superseded . However , the ladder topology used by Campbell with the constant k is still in use today with implementations of modern filter designs such as the Tchebyscheff filter . Campbell gave constant k designs for low @-@ pass , high @-@ pass and band @-@ pass filters . Band @-@ stop and multiple band filters are also possible . 
- 
- = = Terminology = = 
- 
- Some of the impedance terms and section terms used in this article are pictured in the diagram below . Image theory defines quantities in terms of an infinite cascade of two @-@ port sections , and in the case of the filters being discussed , an infinite ladder network of L @-@ sections . Here " L " should not be confused with the inductance L – in electronic filter topology , " L " refers to the specific filter shape which resembles an inverted letter " L " . 
- The sections of the hypothetical infinite filter are made of series elements having impedance 2Z and shunt elements with admittance 2Y . The factor of two is introduced for mathematical convenience , since it is usual to work in terms of half @-@ sections where it disappears . The image impedance of the input and output port of a section will generally not be the same . However , a mid @-@ series section ( that is , a section from halfway through a series element to halfway through the next series element ) will have the same image impedance on both ports due to symmetry . This image impedance is designated ZiT due to the " T " topology of a mid @-@ series section . Likewise , the image impedance of a mid @-@ shunt section is designated ZiΠ due to the " Π " topology . Half of such a " T " or " Π " section is called a half @-@ section , which is also an L @-@ section but with half the element values of the full L @-@ section . The image impedance of the half @-@ section is dissimilar on the input and output ports : on the side presenting the series element it is equal to the mid @-@ series ZiT , but on the side presenting the shunt element it is equal to the mid @-@ shunt ZiΠ . There are thus two variant ways of using a half @-@ section . 
- Parts of this article or section rely on the reader 's knowledge of the complex impedance representation of capacitors and inductors and on knowledge of the frequency domain representation of signals . 
- 
- = = Derivation = = 
- 
- The building block of constant k filters is the half @-@ section " L " network , composed of a series impedance Z , and a shunt admittance Y. The " k " in " constant k " is the value given by , 
- $ k^2 = \frac{Z}{Y} $ 
- Thus , k will have units of impedance , that is , ohms . It is readily apparent that in order for k to be constant , Y must be the dual impedance of Z. A physical interpretation of k can be given by observing that k is the limiting value of Zi as the size of the section ( in terms of values of its components , such as inductances , capacitances , etc . ) approaches zero , while keeping k at its initial value . Thus , k is the characteristic impedance , Z0 , of the transmission line that would be formed by these infinitesimally small sections . It is also the image impedance of the section at resonance , in the case of band @-@ pass filters , or at ω = 0 in the case of low @-@ pass filters . For example , the pictured low @-@ pass half @-@ section has 
- $ k = \sqrt{\frac{L}{C}} $ . 
- Elements L and C can be made arbitrarily small while retaining the same value of k . Z and Y however , are both approaching zero , and from the formulae ( below ) for image impedances , 
- $ \lim_{Z, Y \to 0} Z_i = \sqrt{\frac{Z}{Y}} = k $ . 
- 
- = = = Image impedance = = = 
- 
- See also Image impedance # Derivation 
- The image impedances of the section are given by 
- $ Z_{iT} = \sqrt{Z^2 + k^2} $ 
- and 
- $ Z_{i\Pi} = \frac{k^2}{Z_{iT}} = \frac{k^2}{\sqrt{Z^2 + k^2}} $ 
- Given that the filter does not contain any resistive elements , the image impedance in the pass band of the filter is purely real and in the stop band it is purely imaginary . For example , for the pictured low @-@ pass half @-@ section , 
- $ Z_{iT} = \sqrt{\frac{L}{C} - \omega^2 L^2} $ 
- The transition occurs at a cut @-@ off frequency given by 
- $ \omega_c = \frac{1}{\sqrt{LC}} $ 
- Below this frequency , the image impedance is real , 
- $ Z_{iT} = k \sqrt{1 - \left( \frac{\omega}{\omega_c} \right)^2} $ 
- Above the cut @-@ off frequency the image impedance is imaginary , 
- $ Z_{iT} = i k \sqrt{\left( \frac{\omega}{\omega_c} \right)^2 - 1} $ 
- 
- = = = Transmission parameters = = = 
- 
- The transmission parameters for a general constant k half @-@ section are given by 
- $ \gamma = \sinh^{-1} \frac{Z}{k} $ 
- and for a chain of n half @-@ sections 
- $ \gamma_n = n \gamma $ 
- For the low @-@ pass L @-@ shape section , below the cut @-@ off frequency , the transmission parameters are given by 
- $ \gamma = \alpha + i \beta = 0 + i \sin^{-1} \frac{\omega}{\omega_c} $ 
- That is , the transmission is lossless in the pass @-@ band with only the phase of the signal changing . Above the cut @-@ off frequency , the transmission parameters are : 
- $ \gamma = \alpha + i \beta = \cosh^{-1} \frac{\omega}{\omega_c} + i \frac{\pi}{2} $ 
- 
- = = = Prototype transformations = = = 
- 
- The presented plots of image impedance , attenuation and phase change correspond to a low @-@ pass prototype filter section . The prototype has a cut @-@ off frequency of ωc = 1 rad / s and a nominal impedance k = 1 Ω . This is produced by a filter half @-@ section with inductance L = 1 henry and capacitance C = 1 farad . This prototype can be impedance scaled and frequency scaled to the desired values . The low @-@ pass prototype can also be transformed into high @-@ pass , band @-@ pass or band @-@ stop types by application of suitable frequency transformations . 
- 
- = = Cascading sections = = 
- 
- Several L @-@ shape half @-@ sections may be cascaded to form a composite filter . Like impedance must always face like in these combinations . There are therefore two circuits that can be formed with two identical L @-@ shaped half @-@ sections . Where a port of image impedance ZiT faces another ZiT , the section is called a Π section . Where ZiΠ faces ZiΠ the section so formed is a T section . Further additions of half @-@ sections to either of these sections form a ladder network which may start and end with series or shunt elements . 
- It should be borne in mind that the characteristics of the filter predicted by the image method are only accurate if the section is terminated with its image impedance . This is usually not true of the sections at either end , which are usually terminated with a fixed resistance . The further the section is from the end of the filter , the more accurate the prediction will become , since the effects of the terminating impedances are masked by the intervening sections . 
- 
- 
- = The Snowmen = 
- 
- " The Snowmen " is an episode of the British science fiction television series Doctor Who , first broadcast on Christmas Day 2012 on BBC One . It is the eighth Christmas special since the show 's 2005 revival and the first to be within a series . It was written by head writer and executive producer Steven Moffat and directed by Saul Metzstein . 
- The episode is set in the Victorian era and sees the Doctor ( Matt Smith ) brooding with the assistance of Silurian Madame Vastra ( Neve McIntosh ) , her wife Jenny Flint ( Catrin Stewart ) and Sontaran Strax ( Dan Starkey ) , after the loss of companions Amy Pond and Rory Williams in the previous episode , " The Angels Take Manhattan " . He is forced out of hiding to investigate mysterious , sentient snowmen that are building themselves and meets Clara Oswald ( Jenna @-@ Louise Coleman ) , a governess also investigating the snowmen . They discover that the snowmen are being animated by the Great Intelligence ( voice of Ian McKellen ) with the help of a man named Dr Simeon ( Richard E. Grant ) . 
- Building upon the character 's surprise introduction in " Asylum of the Daleks " , " The Snowmen " introduces Clara as the Doctor 's new companion , though ultimately it would be a third version of her character that would travel with the Doctor starting with " The Bells of Saint John " . In addition to Clara , " The Snowmen " also introduces a redesigned TARDIS , revised title sequence and theme music , and sees changes to the Doctor 's costume . The special was produced in August 2012 , with location filming in Newport , Wales and Bristol . It received final ratings of 9 @.@ 87 million viewers in the UK , becoming the fourth most @-@ watched programme of Christmas Day . " The Snowmen " was met with mostly positive reviews from critics , most of whom received the introduction and character of Clara well . However , some felt that Grant and McKellen were underused as villains or the plot was slight because of the focus on characterisation . 
- 
- = = Plot = = 
- 
- 
- = = = Prequels = = = 
- 
- To promote the special , three prequels were released . The first was broadcast during the 2012 Children in Need telethon on 16 November 2012 , titled " The Great Detective " . The Silurian Madame Vastra , her human wife Jenny Flint , and the Sontaran Strax ( all returning from " A Good Man Goes to War " ) describe a number of strange phenomena to a shadowed fourth detective . The fourth detective reveals himself to be the Doctor , and tells the group that he has retired . 
- A second prequel , titled " Vastra Investigates " , was released online on 17 December 2012 . At the end of a case , Vastra and Jenny converse with an officer from Scotland Yard and apologise for Strax 's violent wishes for the culprit 's punishment . Vastra explains Strax 's alien origin as well as her own to the officer , much to his astonishment . Vastra reveals that she was awoken by an extension to the London Underground and initially disliked humans , though that changed when she fell in love with Jenny . On the carriage ride home , Jenny notices it is beginning to snow and Vastra notes that the snow should be impossible because there are no clouds in the sky . 
- A third prequel , titled " The Battle of Demon 's Run — Two Days Later " , was released on the United States iTunes and Amazon Video stores on 25 March 2013 . Two days after the events of " A Good Man Goes to War " , Vastra and Jenny convince Strax that he is not mortally wounded and invite him to accompany them back to 1800s London . The scene had been filmed as an extra due to the anticipation that fans would ask how Strax was resurrected and came to be in Vastra 's employ . 
- 
- = = = Synopsis = = = 
- 
- In 1842 England , a young boy builds a snowman , but refuses to play with the other children . The snowman starts speaking to the boy , repeating his assertions that the other children are silly . Fifty years later , the boy has grown up to be Dr. Walter Simeon , proprietor of the Great Intelligence Institute . He hires men to collect samples of snow , which he places in a large snow @-@ filled globe in his laboratory before feeding the men to a group of animated snowmen . The Doctor , still despondent after losing his former companions Amy Pond and Rory Williams , has parked his TARDIS above Victorian London among the clouds . He uses his allies Vastra , Jenny , and Strax to keep people away from him . They also fill their time investigating mysteries throughout the city . 
- Elsewhere , barmaid Clara Oswin Oswald investigates a disturbance outside the tavern she works at and finds the Doctor walking by . He attempts to leave discreetly , but Clara follows him to a coach . Not wishing to become involved in matters , the Doctor instructs Strax to bring him a memory worm that will erase the last hour of Clara 's memories with just a touch . Before they can do so , they are surrounded by snowmen created from snow with psychic properties who attack the group . The Doctor realises that Clara 's thoughts are creating the snowmen and ends the threat by instructing her to think of them melting . Clara cautions the Doctor that if he wipes her memory , she will forget how to deal with the snowmen . The Doctor reluctantly allows her to go and ascends a staircase to the sky to return to the TARDIS . Clara follows him and knocks on the door , but she hides and flees down the staircase when the Doctor answers . Clara returns to her other job as governess for the children of Captain Latimer . She learns that Latimer 's daughter has been having horrible dreams about their previous governess returning from the dead . Clara realises that the pond that contains the old governess ' body is the only thing still frozen around them . She attempts to track down the Doctor but instead attracts the attention of Jenny , who takes her to see Vastra . Vastra tells Clara she gets only one word to impress the Doctor with if she wants his help . Clara chooses the word " Pond " , which shocks the Doctor and arouses his interest . 
- Acting on a tip from Strax , the Doctor visits the Great Intelligence Institute posing as Sherlock Holmes . He confronts Dr. Simeon and finds a large glass globe in Simeon 's office that contains psychic snow . The Doctor speaks to the Great Intelligence , the entity that has been speaking to Dr. Simeon since he was a boy . He learns that the Great Intelligence has been controlling the snowmen and has taken interest in Latimer 's pond . The Doctor visits the pond and deduces that the Great Intelligence is using the old governess ' body as a DNA blueprint to form an ice creature that will retain its form and not melt . While Clara is putting the children to bed , the frozen body of the governess breaks into the house . The Doctor fights her off and is joined by Vastra , Jenny and Strax . Dr. Simeon arrives with more snowmen and tells them he wants the governess ' ice body . The Doctor flees with Clara to the roof of the mansion and then to the TARDIS hovering overhead . They are pursued by the ice governess , whom the Doctor traps under a layer of frozen ice crystals . Inside the TARDIS the Doctor gives Clara a TARDIS key , but the ice governess arrives and pulls Clara down off the cloud . 
- The Doctor picks up Clara and takes her back to Latimer 's mansion , placing her under medical care of Strax . He collects the ice fragments from the governess and places them in a souvenir London Underground biscuit tin . He and Vastra travel to Simeon 's lab , where the Doctor notes the Intelligence 's plan to replace humanity with ice creatures and holds up the tin with the necessary DNA . Dr Simeon grabs the tin and opens it only to find the memory worm , which latches on to him . The Doctor states that the Great Intelligence , which has been existing as a mirror of Dr Simeon 's thoughts , will vanish with the erasure of Simeon 's memories . Instead , the Intelligence reveals that it existed long enough that it can now control Simeon 's body , which it uses to attack Vastra and the Doctor . The influence of the Great Intelligence quickly wanes , and Simeon falls dead . Outside , a salt @-@ water rain has started , and the Doctor sees that another psychic ability has taken control of the snow from the Great Intelligence : the Latimer family , crying for Clara . Strax informs the Doctor upon his return to the Latimer mansion that Clara only has moments left , and she passes away as the Doctor returns the TARDIS key to her . At her funeral , the Doctor reads Clara 's full name on her tombstone and realises she is the woman he met in " Asylum of the Daleks " who became a Dalek , whom he refers to as " Soufflé Girl " . He gleefully announces that a person dying twice is an impossibility and , bidding farewell to his allies for now , the Doctor departs in the TARDIS to investigate and find Clara . The episode concludes in contemporary times , where a young woman resembling Clara walks through the same graveyard , pausing by Clara 's tombstone . 
- 
- = = = Continuity = = = 
- 
- The Second Doctor previously encountered the Great Intelligence in the serials The Abominable Snowmen ( 1967 ) , set in the 1930s , and The Web of Fear ( 1968 ) , set in the 1960s . In these stories , the Great Intelligence uses robot Yeti as its physical presence . The events of The Web of Fear are alluded to by the Doctor in " The Snowmen " when he presents the London Underground biscuit tin to the Great Intelligence in Dr Simeon 's laboratory ; the Intelligence states , " I do not understand these markings " , in reference to the 1967 London Underground map design on the tin . The Doctor remarks that the Underground is a " key strategic weakness in metropolitan living " , referring to ( and possibly setting in motion ) the future Yeti attack on London via the Underground . 
- Coleman previously played Oswin Oswald in " Asylum of the Daleks " , though the connection between the two characters is not clarified until Clara reveals she has an interest in soufflés , a trait that Oswin 's character also had . The Doctor , after meeting Clara , wistfully replies " those were the days " when she asks why he isn 't staying to get acquainted with her , which are the same words he tells Craig Owens ( " Closing Time " ) when Craig comments that the Doctor always wins . The final scenes at the graveyard establish that Clara shares the same name as Oswin , leading the Doctor to surmise they are the same person . As seen on her gravestone , Clara 's birthdate is 23 November , the date Doctor Who was first transmitted in 1963 . 
- 
- = = Production = = 
- 
- 
- = = = Writing and design changes = = = 
- 
- Writer Steven Moffat stated that he wanted an " epic " quality to the Christmas special . The story would also show how the Doctor had responded to losing his previous companions ; Moffat said that " I think he 's probably reached the point in his life where he 's saying , ' Friendship for me is just postponed bereavement — I want to be on my own for a while ' . " Moffat compared the withdrawn Doctor seen at the onset of the episode to the first appearances of the First Doctor ( William Hartnell ) in 1963 and the Ninth Doctor ( Christopher Eccleston ) in 2005 . He also attributed the idea of a retired Doctor to a plot proposed by Douglas Adams in the 1970s , but rejected by the production team at the time . Continuing the theme introduced with the series ' first five episodes , " The Snowmen " was promoted like a movie . A movie poster was released in the Radio Times , showing the Doctor and Clara ascending the ladder to the TARDIS . 
- The episode saw several major design changes for the series . " The Snowmen " is the debut of a redesigned TARDIS interior , as well as a new title sequence and variation of the theme tune . The new title sequence features a brief glimpse of the Doctor 's face , the first time since Survival , the final serial of the classic series in 1989 , that the Doctor 's face has been seen in the title sequence . Moffat had noticed that the TARDIS ' design was getting " progressively whimsical " and resembled more of a " magical place " rather than a machine . It was designed by series production designer Michael Pickwood , who stated that the new interior was also supposed to be " darker and moodier " and provide an easier access to the " gallery " of the ship when shooting . 
- The Doctor also wears a one @-@ off costume , Victorian @-@ themed , which Smith described as " a bit Artful Dodger meets the Doctor " . Moffat described the new outfit as a " progression " as the Doctor was in " a different phase of his life now " and felt more " grown @-@ up " and fatherlike . The costume was designed by Howard Burden for this episode . " The Snowmen " also contains several references to Sherlock Holmes , including the Doctor dressing up as him . Moffat is co @-@ creator of the BBC series Sherlock , for which Smith auditioned for the role of Doctor Watson before being cast as the Doctor . In addition , the incidental music during the scene bears a resemblance to the Sherlock theme . 
- 
- = = = Casting = = = 
- 
- This episode marks the return of Jenna @-@ Louise Coleman , who previously appeared in the series opener , " Asylum of the Daleks " . Coleman was cast because of her chemistry with Matt Smith , and especially because she was able to talk faster than him . She auditioned for the role of Clara , not Oswin from " Asylum " , as the concept of the two characters being the same only occurred to Moffat whilst casting for Clara . The production team requested that the press and fans who attended advanced screenings keep Coleman 's appearance a secret until " Asylum " was broadcast ; the effort was ultimately successful . Moffat stated that the introduction of a new companion made " the show feel different " and brought the story to " a new beginning " with a different person meeting the Doctor . Smith said that Clara was different from her predecessor Amy Pond ( Karen Gillan ) , which allowed the audience to see a different side of the Doctor . Coleman described her as resourceful and not intimidated , citing the reason for following the Doctor at the beginning as pursuing answers . The Clara who would become a travelling companion of the Doctor would not debut until the Spring premiere , " The Bells of Saint John " , save for a brief cameo at the end of " The Snowmen " . Coleman stated that she played each version as individuals with " trust that there would be a payoff " to her mystery . 
- Also returning to the series are Neve McIntosh as Madame Vastra , Dan Starkey as Strax and Catrin Stewart as Jenny . All three previously appeared in " A Good Man Goes to War " and reprised their roles both in this episode and in the prequels . They returned due to the popularity of Vastra and Jenny ; Moffat considered a spin @-@ off featuring them , though he did not have the time to do it . Instead , he decided to bring them back in the main series . Richard E. Grant had previously played the Doctor on two occasions , as an alternative Tenth Doctor in the spoof charity special Doctor Who and the Curse of Fatal Death , which was written by Moffat and as an alternative Ninth Doctor in the animated story Scream of the Shalka which had been intended to be a continuation of the series before it was revived in 2005 . Smith commented that Grant was " born to be a Who villain . He pitches it on that perfect level and tone " . Grant 's appearance in Doctor Who was teased by the BBC via Twitter , announcing his appearance at midnight 5 August 2012 . Tom Ward was drawn to his role because of the quality of the script , and also stated his young children were pleased that he appeared in the programme . The Great Intelligence was voiced by Sir Ian McKellen . The two children Clara is governess to , Digby and Francesca , were played by real @-@ life brother and sister Joseph and Ellie Darcey @-@ Alden . 
- 
- = = = Filming and effects = = = 
- 
- " The Snowmen " was originally intended to be produced in the fourth production block of the series and be the first episode Coleman shot as her character ; however , it did not begin filming until the week of 6 August 2012 , after Coleman had worked on later episodes while Moffat was writing the Christmas special . The read @-@ through had taken place on 2 August 2012 . This was the first Christmas special to be filmed in BBC Wales ' new Roath Lock studios . Scenes featuring Coleman and several guest stars in a Victorian setting were filmed in Newport , Wales , while Coleman and Smith were also spotted filming in Bristol two weeks later on 21 August . Some scenes which used snow props were filmed in Portland Square , Bristol , where filming took place overnight on 21 – 22 August 2012 . Bristol was chosen because it had Victorian @-@ era architecture . Pickwood stated that his favourite set is the London Street with the back of the pub , which he said was based on a sixteenth @-@ century building in Oxford . The locations were blocked off and sprayed with fake snow . 
- The TARDIS on the cloud was achieved through a mix of fog on the studio floor and post @-@ production special effects . Director Saul Metzstein explained that it was difficult to achieve the desired look for the snowmen ; the first ones he likened to Zippy from Rainbow which was too " cute " of an appearance , and so the effects team created more menacing CGI faces . Clara 's introduction to the TARDIS introduced two novel effects for the show . The first was a single @-@ shot camera tracking from a few feet away from the TARDIS to its interior , with the implication of the TARDIS 's trans @-@ dimensional nature shown to the audience . In the following shot , the camera does a complete circle of the TARDIS console , an effect not seen since the early days of the show . Metzstein wanted to include this shot to further emphasize the " bigger on the inside than the outside " nature of the time machine . 
- In addition to the three prequel mini @-@ episodes , the cast also filmed an additional promotional video , " Songtaran Carols , " which the BBC uploaded during the days leading up to the broadcast . The video featured Starkey singing modified versions of several Christmas songs in character as Strax as his castmates look on , before everyone breaks character and begins laughing . 
- 
- = = Broadcast and reception = = 
- 
- " The Snowmen " aired on BBC One on 25 December 2012 at 5 : 15 pm , the same day on BBC America in the US and Space in Canada and the next day on ABC1 in Australia and on Prime in New Zealand . UK overnight ratings showed that the special had been watched by 7 @.@ 6 million viewers , coming in sixth for the night . Final consolidated figures ( not including BBC iPlayer viewers ) showed that the episode was watched by 9 @.@ 87 million viewers , coming in fourth for the night . It also received an Appreciation Index figure of 87 , higher than most of the Doctor Who Christmas specials . The iPlayer version had 1 @,@ 467 @,@ 220 views , making it the most popular TV show on iPlayer over Christmas . The US airing was seen by 1 @.@ 43 million viewers , with a 0 @.@ 6 rating in the demographic of adults aged 18 – 49 . 
- 
- = = = Critical reception = = = 
- 
- The episode received mostly positive reviews . Dan Martin of The Guardian called it " actually the best Christmas Special since ' The Christmas Invasion ' " and the first to be " actually scary " , with " everything we like " about Doctor Who and Christmas . He praised Coleman 's introduction as Clara and the gang of Vastra , Jenny , and Strax . IGN 's Matt Risley gave " The Snowmen " a score of 9 @.@ 4 out of 10 , describing it as " a rollicking , riveting masterclass in storytelling " which " refreshingly " lacked traditional Christmas references " in favour of some sparkling dialogue , gorgeous set design and fascinating characterisation " . While he felt that Grant and McKellen were underused , he was very positive towards Coleman 's " unpredictable " Clara . Radio Times reviewer Patrick Mulkern was pleased with the return of the Great Intelligence despite an inconsistency in the timeline he found , and praised the " lovely images " and direction of the special , though he felt the variation of the theme music " lacks the menace " of the original . While he was positive towards Clara , he was " unmoved by her death " as it was " plainly silly " that she did not look injured . 
- Nick Setchfield of SFX gave the special four and a half out of five stars , writing that " the power of emotion saves the day again " was appropriate in light of the festivities and many fairytales referenced in the story . Setchfield was positive towards the " terrific " comedy with Strax , Coleman and the " surprisingly underused " Grant , as well as the new title sequence and TARDIS . While he wrote that the subtle callback of the Great Intelligence was " a tad more interesting than the usual ' So , we meet again ! ' schtick " , he ultimately felt their threat " never quite comes into sharp relief " . Neela Debnath of The Independent wrote that " The Snowmen " was stronger than the previous year 's " The Doctor , the Widow , and the Wardrobe " as it was connected to the overall story of the series , but " still has a way to go if it is to live up to ' A Christmas Carol ' " . Despite feeling that it was " enjoyable " , she noted that " the story feels truncated and rushed " . 
- The Mirror 's Jon Cooper also praised Coleman and the new side of the Doctor that was shown , comparing it to Rose Tyler ( Billie Piper ) challenging the Ninth Doctor ( Christopher Eccleston ) . However , he felt the character @-@ heavy story was to the detriment of the plot , which was " a classic Who set @-@ up that ultimately suffers from a lack of explanation [ and ] more set @-@ pieces than a coherent whole " . He felt that the episode may not have been accessible for casual viewers , but offered much for fans in time for the programme 's fiftieth anniversary . Dominic Cavendish of The Daily Telegraph gave " The Snowmen " three out of five stars , disappointed that it was not as scary as it had been hyped to be . While he was positive towards Smith and the TARDIS on the cloud , he criticised Strax and the " Sudoku @-@ like complexity " of the script . 
- The episode was nominated for the 2013 Hugo Award for Best Dramatic Presentation ( Short Form ) , alongside " Asylum of the Daleks " and " The Angels Take Manhattan " , but lost to the Game of Thrones episode " Blackwater " . 
- 
- = = DVD release = = 
- 
- " The Snowmen " was initially released as a standalone on DVD and Blu @-@ ray in the UK and North America . It was later included as part of the DVD / Blu @-@ ray box set Doctor Who : The Complete Seventh Series in September 2013 . 
- It has subsequently been reissued in several box set compilations , most recently alongside the Christmas specials between " The Christmas Invasion " and " Last Christmas " inclusive in a boxset titled Doctor Who – The 10 Christmas Specials on 19 October 2015 . 
- 
- = = Soundtrack = = 
- 
- Selected pieces of score from " The Snowmen " and the preceding Christmas special , as composed by Murray Gold , were included on a soundtrack released on 21 October 2013 by Silva Screen Records . 
- 
- 
- = No. 20 Squadron RAAF = 
- 
- No. 20 Squadron is a Royal Australian Air Force ( RAAF ) support squadron . Coming under the control of No. 96 Wing , it is responsible for the management of the airfield at RAAF Base Woomera , South Australia . The squadron originated as a maritime patrol unit during World War II . Raised in August 1941 , it operated PBY Catalina and Short Empire flying boats from bases in New Guinea , Queensland and the Northern Territory , conducting search @-@ and @-@ rescue , mine @-@ laying , anti @-@ submarine and bombing missions against Japanese targets in the Pacific theatre . Following the conclusion of hostilities , the squadron was disbanded in March 1946 . It was reactivated as an airfield support squadron in April 2015 . 
- 
- = = History = = 
- 
- 
- = = = World War II = = = 
- 
- No. 20 Squadron was formed at Port Moresby , New Guinea , on 1 August 1941 for a general reconnaissance role , under the command of Squadron Leader W.N. Gibson . Its establishment was six PBY Catalina flying boats and 133 personnel , but only five aircraft ( all transferred from No. 11 Squadron ) and 55 personnel were available initially . The squadron conducted long @-@ range patrols between bases scattered around the islands to Australia 's north in conjunction with No. 11 Squadron . On 18 November , No. 20 Squadron 's Catalinas were augmented by two Short Empire flying boats transferred from No. 11 Squadron . 
- On 25 November 1941 , following the loss of HMAS Sydney , one of No. 20 Squadron 's Catalinas was despatched to Western Australia to join a No. 11 Squadron Catalina in search @-@ and @-@ rescue missions , but they found only oil slicks . By the outbreak of war in the Pacific , No. 20 Squadron had a strength of six Catalinas and two Empire flying boats . Its personnel at the beginning of December numbered 14 officers and 118 men . The squadron undertook its first sortie of the Pacific War on 8 December ; a Catalina located three Japanese luggers in the vicinity of Thursday Island , Queensland . Later in the month it commenced anti @-@ submarine patrols and , in January 1942 , bombing raids against Japanese bases . As the Japanese advanced into the South West Pacific , No. 20 Squadron was also responsible for evacuating white civilians from areas threatened by invasion . On 21 January , one of its Catalinas located the Japanese fleet steaming for Rabaul and signalled a warning to the town 's Australian defenders before being shot down by anti @-@ aircraft fire ; it was the squadron 's first combat loss . 
- In the wake of the fall of Rabaul , the Catalinas of Nos. 11 and 20 Squadrons became the RAAF 's only offensive weapon against the Japanese . Their raids on Rabaul did little to stem the Japanese advance , and in the following months Port Moresby itself was subjected to increasingly frequent attacks , which destroyed aircraft , facilities , and squadron records . In February 1942 , the Short Empires operated by Nos. 11 and 20 Squadrons were transferred to the newly formed No. 33 ( Transport ) Squadron . No. 20 Squadron lost two Catalinas during patrols on 4 and 6 May ; the nine crewmen of the first were later found to have been captured and beheaded ; the crew of the second were also captured and subsequently disappeared without trace . 
- In response to the threat of invasion at Port Moresby , Nos. 11 and 20 Squadrons moved to Bowen , Queensland , on 7 May 1942 . They were soon attacking Japanese targets in Lae , Salamaua and Rabaul . On 27 June , each squadron contributed an aircraft to a four @-@ hour raid over Lae and Salamaua during which , as well as bombs , the RAAF crews dropped beer bottles to disrupt the enemy soldiers ' sleep — the sound they made falling through the air was , according to the official history , " something between a shrill whistle and a scream " . By 1 July , No. 20 Squadron 's strength was six Catalinas and 175 personnel , out of a planned establishment of nine aircraft and 415 personnel . Its prime responsibility in early 1942 was maritime reconnaissance as far as New Guinea , the Solomon Islands , and New Caledonia ; the latter half of the year saw a greater focus on night bombing . Now comprising 252 officers and men , the squadron relocated to Cairns on 11 November 1942 . From Cairns it continued to conduct reconnaissance , anti @-@ submarine and occasional bombing operations over the waters around New Guinea . Between December 1942 and March 1943 , No. 20 Squadron 's aircraft flew a total of 9 @,@ 629 hours and dropped 227 tons of bombs . The squadron 's role changed in June 1943 when it commenced mine @-@ laying operations over the Netherlands East Indies and the Philippines , though it continued to make some bombing raids and supply drops . 
- In September 1944 , No. 20 Squadron became part of No. 76 Wing RAAF , along with Nos. 42 and 43 Squadrons , and moved to Darwin , Northern Territory . All three squadrons operated Catalinas , their primary purpose being mine @-@ laying . On the night of 30 September , a Catalina of No. 20 Squadron was shot down while attacking a ship at Pomelaa in the Dutch East Indies ; the loss was compounded by the fact that one of the coordinators of the mining campaign , Lieutenant Commander P.E. Carr of the Royal Australian Navy , was aboard the plane and was captured by the Japanese . Another of the squadron 's Catalinas went down on the night of 27 / 28 January 1945 , possibly in a cyclone over the Timor Sea , during the campaign to mine Surabaya . In March , a detachment of four No. 20 Squadron aircraft , along with four from No. 43 Squadron , laid mines off the coast of southern China and Formosa as part of a No. 76 Wing offensive in this area ; these operations were conducted from Leyte Gulf in the Philippines . One of No. 20 Squadron 's Catalinas was lost on the night of 7 / 8 March , most likely owing to bad weather rather than enemy action . Three of the squadron 's aircraft mined the entrance to Hong Kong harbour on 8 April and , on 26 May , four of its Catalinas mined Wenchow harbour in China , the furthest north that any Australian aircraft infiltrated during the war in the Pacific . Three of its aircraft flew the RAAF 's last mine @-@ laying mission on 30 July . 
- No. 20 Squadron 's final wartime sortie was a patrol on 14 August 1945 . Following the end of the war , the squadron operated in the transport role and ferried Australian prisoners of war home from various locations in South East Asia . It relocated to RAAF Station Rathmines , New South Wales , on 21 November . No. 20 Squadron flew its last mission , a transport flight to Balikpapan , on 21 January 1946 , and disbanded at Rathmines on 27 March . 
- 
- = = = Post @-@ war re @-@ establishment = = = 
- 
- No. 20 Squadron was reactivated on 1 April 2015 to support airfield operations at RAAF Base Woomera , South Australia . Consisting of nine uniformed personnel and one Australian Public Service member under the command of Squadron Leader Simon Bartlett , the squadron formed part of No. 96 Wing , a component of Combat Support Group ( CSG ) . The airfield had previously been managed under the auspices of Aerospace Operational Support Group , but a command @-@ and @-@ control review commissioned by the Chief of Air Force recommended that , in common with other RAAF airfields , it should be administered by CSG . RAAF Base Woomera , incorporating Woomera Village , was one of two Air Force units formally established on 12 January 2015 as part of a reorganisation of the Woomera Range Complex , the other unit being RAAF Woomera Test Range . 
- The design of the reactivated squadron 's crest includes a wedge @-@ tailed eagle to denote courage and nobility , a woomera spear thrower to symbolise the town and its indigenous heritage , Sturt 's Desert Pea to represent South Australia , and the Pleiades star cluster , which features in the folklore of the local Kokatha people . 
- 
- 
- = Light Tank Mk VII Tetrarch = 
- 
- The Light Tank Mk VII ( A17 ) , also known as the Tetrarch , was a British light tank produced by Vickers @-@ Armstrongs in the late 1930s and deployed during World War II . The Tetrarch was originally designed as the latest in the line of light tanks built by the company for the British Army . It improved upon its predecessor , the Mk VIB Light Tank , by introducing the extra firepower of a 2 @-@ pounder gun . The War Office ordered 70 tanks , an order that eventually increased to 220 . Production was delayed by several factors , and as a consequence , only 100 to 177 of the tanks were produced . 
- The tank 's design flaws , combined with the decision by the War Office not to use light tanks in British armoured divisions , ruled out the use of Tetrarchs in the North African Campaign . As a result , the majority of the tanks remained in Britain , although 20 were sent to the USSR as part of the Lend @-@ Lease program . In early 1941 , the Royal Armoured Corps formed three squadrons for use in overseas amphibious operations , one of which was equipped with Tetrarchs . In May 1942 , a small number of Tetrarchs formed part of the British force which participated in the invasion of Madagascar , and , in June 1942 , Tetrarchs were attached to the 1st Airborne Division after it was decided that the design allowed its use as an air @-@ portable light tank to support British airborne forces . The Tetrarchs were transported and landed in specially designed General Aircraft Hamilcar gliders . A lack of gliders prevented their participation in the Allied invasion of Sicily in 1943 ; instead they were attached to the new 6th Airborne Division and became part of the 6th Airborne Armoured Reconnaissance Regiment . 
- The division used approximately 20 Tetrarchs during the British airborne landings in Normandy in June 1944 . The tanks were successfully landed by glider , but they did not perform well . Several were lost in accidents , and those that did see action proved to be inferior in firepower and armour to the armoured fighting vehicles of the German forces . A few days after the beginning of the operation , the tanks were removed from direct engagement with German armour and used only to provide fire support . By August 1944 , most of the Tetrarchs in action were replaced with Cromwell cruiser tanks , and the remainder were replaced by the M22 Locust in December 1944 . 
- Tetrarchs did not see any further combat and were deemed obsolete by 1946 ; the last was retired in 1950 . There were several variations on the Tetrarch design , including the Alecto self @-@ propelled gun and the Light Tank Mk VIII , but none of these were ever used in active service with the British Army . 
- 
- = = Development history = = 
- 
- 
- = = = Initial development = = = 
- 
- The prototype of the Light Tank Mk VII ( A17 ) , nicknamed ' Purdah ' , was first developed in 1937 by Vickers @-@ Armstrongs as a private venture , and was intended to be sold either to the British Army or to foreign militaries . It was to be the latest in a series of light tanks produced by the company . The tank was designed to overcome the shortcomings of insufficient armament in earlier light tanks that were fitted only with machine guns . Vickers @-@ Armstrongs installed on the Mk VIIs a 2 @-@ pounder 40 @-@ millimetre ( 1 @.@ 6 in ) main gun paired with a 7 @.@ 92 @-@ millimetre ( 0 @.@ 312 in ) Besa machine gun , and mounted the two guns in a two @-@ man turret . The tank possessed a maximum of 14 millimetres ( 0 @.@ 55 in ) of armour . The prototype weighed approximately 16 @,@ 800 pounds ( 7 @,@ 600 kg ) and was powered by a 165 @-@ horsepower ( 123 kW ) Meadows engine . Suspension was on eight road wheels , four per side , with no separate driver or idler wheels and it was capable of a 40 miles per hour ( 64 km / h ) top speed . The Mk VII design relied on an unusual steering method and a mechanical system incorporated into earlier Vickers models . The front wheels could be steered to allow for gentle turns by bending the tracks . For sharper turns , the system returned to the conventional method of braking one track to turn the tank ; the dual system of turning was designed to lessen mechanical strain on the Mk VII and reduce its power wastage . The suspension system was also a new design that relied on struts with pockets of air for springing and cushions of oil for damping , and each of the wheels was independently sprung . 
- The War Office examined the design and put the prototype through a series of trials during May and June 1938 ; the model was tested as a possible " light cruiser " since War Office light tank needs were already met by its predecessor , the Mark VI . The War Office then took the view that the tank was not acceptable as a light cruiser because the Nuffield A13 offered better speed and obstacle crossing performance . Despite this , it was decided that it was essential for some Tetrarchs to be produced , and it was suggested that they be brought in at the end of the light tank program . Accordingly , the War Office gave the Tetrarch the official General Staff specification number A17 , and , in November 1938 , accepted it for limited production after requesting a few minor changes which included the fitting of an external fuel tank to increase the tank 's range . 
- 
- = = = Production = = = 
- 
- The number to be produced was subject to fluctuation as the War Office vacillated in their demand ; in July 1938 , it requested that 70 of the tanks be produced , then increased the request to 120 after a three @-@ day conference in November . Production was to begin in July 1940 , but meanwhile the War Office temporarily returned to its original order of 70 before increasing the number to 100 . The number further increased to 220 after Metropolitan Cammell Carriage and Wagon , a company part owned by Vickers @-@ Armstrong that would be producing the tanks , indicated it had already ordered armour plating for that many tanks . 
- Production of the tank was delayed by a number of factors . The War Office put their order on hold in a post @-@ Battle of France decision to focus military production on infantry and cruiser tanks , due to the poor performance of British light tanks during that battle . Due to the shortage of more suitable tanks , light tanks that were not designed for use against German armour , were nevertheless deployed against them ; the resulting high casualties led the War Office to re @-@ evaluate the suitability of the light tank design . The pre @-@ war role of the light tank , that of reconnaissance , meanwhile had been found to be better suited to scout cars that used smaller crews and had better cross @-@ country abilities . Further delays were caused by the bombing raids of the Luftwaffe during May 1941 against the factories where the tanks were assembled . 
- The cumulative effect of these delays resulted in the production of only a small number of Mk VIIs ; estimates place the final total produced to be between 100 and 177 . The name ' Tetrarch ' was given to the Mk VII , on 22 September 1941 , on the orders of the War Office . The last of the tanks were built in the first quarter of 1942 and delivered at the end of the year . 
- 
- = = = Transfer to airborne role = = = 
- 
- The War Office and the Army were concluding , at this point , that light tanks were a liability and too vulnerable for use in further combat , and the Tetrarch was considered to be obsolete . This decision may have marked the end for the Tetrarch in active service ; several of the tanks destined to be deployed to the Eighth Army in the Middle East for the North African Campaign were left in Britain when their cooling systems were determined to be unable to cope with the intense North African heat . 
- The demise of Tetrarch was prevented by a decision made by the War Office in mid @-@ 1941 , as it was considering the equipment to be used by Britain 's fledgling airborne forces , formed in June 1940 under the orders of the Prime Minister , Winston Churchill . When selecting the equipment for the airborne forces , officials at the War Office concluded that gliders would be an integral component ; gliders would transport troops and heavy equipment , which , by 1941 , was to include artillery and some form of tank . Plans to transport a tank went through a number of revisions , but , by May 1941 , the feasibility of a 5 @.@ 5 metric tons ( 5 @.@ 4 long tons ) tank to be carried for 350 miles ( 560 km ) in a glider was accepted , although the aircraft would have to be specifically designed for the task . In a conference held on 16 January 1941 , it was decided that the General Aircraft Hamilcar , currently under development , would be used to transport a single Tetrarch tank or two Universal Carriers . The Tetrarch was chosen because it was an obsolete design , and was therefore available to be used by the airborne forces . 
- Beginning in January 1944 , training exercises were conducted carrying the Tetrarchs and their crews inside Hamilcar gliders . These exercises were successful ; during the training by ' C ' Squadron of the Glider Pilot Regiment , which specialised in flying the Hamilcars , over 2 @,@ 800 lifts were made with an average of 50 lifts per crew . Only three incidents resulted in fatalities or injuries , with seven pilots killed during the training . When the Tetrarch was re @-@ designated as an airborne tank , several changes were made to its design . A number of tanks had their 2 pounder guns replaced with a 76 @.@ 2 @-@ millimetre ( 3 @.@ 00 in ) infantry support howitzer ; these tanks were then designated as Tetrarch 1 CS ( Close Support ) . Additionally , Littlejohn adaptors were added to those Tetrarchs which still possessed their 2 pounders to increase their muzzle velocity and armour penetration . 
- The Tetrarch experienced several setbacks throughout its development and deployment with the Army and airborne forces . One of the major problems was the limited number of these tanks that existed after production ended in 1942 , which particularly affected the airborne forces . The transport of 20 of the tanks to the USSR under the Lend @-@ Lease Act depleted the number available for use by airborne forces , as did the loss of several more during Operation Ironclad , the invasion of Madagascar . A Royal Armoured Corps report issued in December 1942 stated that approximately 50 Tetrarchs were available for use . In a memorandum , dated January 1943 , by Major General George F. Hopkinson , commander of the 1st Airborne Division , Hopkinson complained that he had been informed that 70 of the tanks were available , whereas only 50 actually remained , with no reserves to replace those lost in combat . This lack of sufficient replacement reserves , combined with a War Office report that some 287 airborne tanks would be required for the 1st Airborne Division and an unnamed airborne division to be formed in India , led to the Tetrarch 's eventual replacement by the US M22 Locust . 
- 
- = = Performance = = 
- 
- A number of design faults of the Tetrarch were revealed through its operational use . Its size limited the possible crew to three , a driver in the hull and a gunner and commander in the turret , resulting in too few crew members to operate the Tetrarch effectively . The gunner or commander , in addition to his own duties , had to act as loader for the 2 pounder , which caused delays in combat . A report on the tank written in January 1941 stated that as the commander had to both fight and control the tank , controlling a troop of Tetrarchs during combat would be almost impossible . 
- Problems were also found with the Littlejohn adaptor fitted to the 2 @-@ pounder to increase its range and penetration power ; after they had been fitted the adaptors could not be removed , and could only fire specially designed armour @-@ piercing rounds , which took time to manufacture . 
- The War Office also considered the Tetrarch 's cooling system faulty , making the tank unsuitable for service in hotter climates , such as the Middle East and North Africa . 
- 
- = = Operational history = = 
- 
- 
- = = = Lend @-@ lease = = = 
- 
- The first Tetrarchs were delivered to the Army in November 1940 , and were initially deployed with the 1st Armoured Division ( which was being refitted after losing the majority of its previous tanks during the Battle of France ) and the newly formed 6th Armoured Division . However , the faults discovered with the Tetrarch cooling system precluded them from being integrated into units that were sent to the Middle East to participate in the North African Campaign . Shortly after , all light tanks were discarded from the establishments of British armoured divisions as not suitable for further service . 
- The Tetrarchs remained in Britain , and would probably have been used as training vehicles before being retired from service , but on 22 June 1941 the German invasion of the USSR , Operation Barbarossa , began , and the USSR became an ally of Britain . The Lend @-@ Lease program , begun in March 1941 by the United States of America to supply defensive materials to Britain and China , was therefore extended to the USSR . As part of the program , the British government began supplying war materials to the USSR , which in early 1942 , included a shipment of 20 Tetrarchs , as well as a number of Valentine and Matilda Mk I Infantry tanks . The Soviet military utilised a greater number of light tanks than the British , and so could use the Tetrarchs . When the tanks arrived in the USSR , however , it was apparent that the design problems with the cooling system were also present in cold conditions ; additionally , the cold weather had a deleterious effect on the tank 's suspension and tracks . Additional testing of the Tetrarchs was conducted by the Soviet military and the design was admired for its controllability , manoeuvrability , and speed , as well as its ability to run on low @-@ quality fuel , unlike contemporary Soviet designs . The thinness of the Tetrarch 's armour was found to be a problem and one which could not be solved , as the weight of extra armour plating caused an unacceptable reduction in the tank 's speed . Despite these drawbacks in the Tetrarch 's design , Soviet authorities believed it to be comparable to the T @-@ 70 light tank in use at the time , and decided that it was suitable to be used in combat . 
A number of Tetrarchs were sent to Tank Training Schools and were subsequently sent into battle , and in September 1943 two were assigned to the 132nd Separated Tank Battalion , which was attached to the 5th Guards Tank Brigade ; both tanks were destroyed in combat , one on 30 September and the other on 2 October , the latter a casualty of artillery fire . Several were also used for propaganda purposes , appearing in photographs of Soviet troops who were fighting in the Caucasus region . 
- 
- = = = Operation Ironclad = = = 
- 
- In mid @-@ 1941 , the Royal Armoured Corps in Britain created three tank squadrons for special overseas operations , known as ' A ' , ' B ' and ' C ' Special Service Squadrons . Both ' A ' and ' B ' Squadrons were equipped with Valentine Infantry tanks and Mark VIc light tanks , but ' C ' Squadron was equipped with twelve Tetrarchs transferred from the 2nd Armoured Brigade , 1st Armoured Division . On 31 July 1941 , ' C ' Squadron was officially activated and immediately received orders to prepare for overseas service alongside ' A ' and ' B ' Squadrons in an unspecified tropical climate . All three squadrons were transported to Inveraray in Scotland for intensive training that focused on embarkation and disembarkation from ships and landing craft to prepare them for action in potential amphibious operations . In early September , elements of ' C ' Squadron , including six Tetrarchs , formed part of a force which sailed for Freetown in West Africa ; during this period of the war there were fears that the Spanish government might enter the conflict on the side of Germany , and the force was readied to capture a number of Spanish islands off the coast of Africa if this occurred . These fears proved groundless , and in March 1942 , the unit returned to Britain to join the rest of the squadron in training . 
- The next assignment , Operation Ironclad , was the invasion of Madagascar , the third largest island in the world and then under Vichy French control . The Prime Minister and the Combined Chiefs of Staff decided that Madagascar should be occupied as rapidly as possible to deny the port of Antsirane to Japanese naval forces , which had recently advanced into the Indian Ocean . Operation Ironclad was under the command of Major General Robert G. Sturges and consisted of No. 5 Commando , 29th Independent Brigade Group , and the 17th and 13th brigade groups from 5th Infantry Division . The 29th Brigade formed the core of the invasion force due to its training in amphibious operations , and under its command was ' B ' Special Service Squadron , created by amalgamating six Valentines from ' B ' Squadron and six Tetrarchs from ' C ' Squadron into a single unit . The squadron was formed into four troops , one Headquarters troop of three Valentines and one Tetrarch , one of four Valentines , and two formed from the remaining five Tetrarchs . The invasion force assembled off the west coast of the northern tip of Madagascar on 4 May , near Antsirane and the bay of Diego Suarez . The invasion plan called for an amphibious assault landing on four beaches on the west side of the tip , which would allow the British forces to advance approximately 20 miles ( 32 km ) and approach Antsirane from the rear . Information about the landing beaches , the defences possessed by the port , and the Vichy French defending forces was limited and vague , although it was believed that the defenders had no weapons capable of penetrating the armour of a Valentine tank . 
- The landings began at 04 : 30 on 5 May , with 5 Commando landing at Courrier Bay and the three infantry brigades and ' B ' Squadron landing at Ambararata Bay . The objective of the infantry brigades and their armoured support was to take control of Antsirane and a nearby town , but although the infantry landed successfully , ' B ' Squadron had more trouble ; the area of beach designated for its landing craft was blocked for several hours after a Tetrarch came loose from a landing craft and became stuck in the sand . The infantry brigades advanced toward Antsirane without the squadron , but eventually two Valentines and a single Tetrarch were dispatched in support , catching up with the lead elements of the infantry near the town of Anamakia . Here the invasion force encountered the first French defences , consisting of camouflaged trenches and pillboxes dug in along a ridge . The tanks attempted to breach them , but the rocky ground made manoeuvring difficult and they could not close with the pillboxes and trenches ; they engaged a number of targets with 2 pounder and machine @-@ gun fire , but the line had to be cleared by an infantry assault later in the day . The tanks were ordered to outflank the defences and advance further into the island , and they were soon joined by two other Tetrarchs dispatched from the beaches ; the small force continued to advance until it encountered the Vichy French main line of defence . This had been built prior to the First World War and included camouflaged pillboxes , machine @-@ gun nests and dug @-@ in 75 mm artillery pieces ; the latter , although not specifically designed for an anti @-@ tank role , could penetrate the armour of both the Tetrarchs and the Valentines . 
The two Valentines advanced first but were knocked out by artillery fire , and two Tetrarchs that were moving behind them suffered the same fate ; the third Tetrarch retreated in order to report on the French resistance , machine gunning a motorcycle combination and a truck it encountered on the way back . 
- The commander of the Tetrarch made his report , and was then ordered to take command of four Valentines and two Tetrarchs which had recently arrived and once again attempt to breach the French defences . The tanks followed the road leading to the defensive line and then attempted to out @-@ flank the line by advancing from the right @-@ hand side , using several hills as cover ; the artillery pieces were able to turn and face the assault , however , and one Valentine and one Tetrarch were hit and destroyed . The remaining tanks exchanged several volleys of fire with the artillery pieces before retreating back to their original positions . The French line was eventually broken by 29th Brigade , aided by an amphibious assault by Royal Marines ; the remaining tanks of ' B ' Squadron , two Valentines and three Tetrarchs , remained in defensive positions until the afternoon of 6 May , coming under sporadic artillery fire which disabled another Valentine . The squadron played no further part in the battle , as the Vichy French authorities negotiated a formal surrender the following day , although French troops would continue to engage the British occupying force in guerrilla warfare until late November . ' C ' Squadron suffered heavy casualties during the invasion ; only one Valentine and three Tetrarchs out of twelve tanks were functional by 7 May , and the squadron had suffered seven killed and six wounded . It remained in Madagascar until early 1943 , when it was shipped to India and took part in the Burma Campaign as part of 29th Brigade . 
- 
- = = = Operation Tonga = = = 
- 
- Because of a lack of equipment training facilities in mid @-@ 1940 , when the British airborne establishment was formed , the War Office was able to accept only 500 volunteers for training as airborne troops . Progress in setting up proper training facilities and acquiring suitable transport aircraft was so slow that the first British airborne operation , Operation Colossus , was conducted by a retrained Commando unit . By 1942 , there existed specifically trained airborne units , including the 1st Airborne Division , and on 19 January 1942 the War Office decided that a light tank unit would be one of the support units attached to the division . This unit , designated the Light Tank Squadron , was to be formed of nineteen light tanks and would operate to the fore of the division , using their tanks ' speed to capture objectives and then holding them until relieved by other units . The obvious unit for conversion was ' C ' Special Services Squadron , as it was trained to act as an independent tank unit and , more importantly , was the only unit that was still using Tetrarchs ; it had been re @-@ designated as an airborne tank by the War Office . ' C ' Squadron was officially transferred to the 1st Airborne Division on 24 June 1942 , bringing with it seven Tetrarchs among its other vehicles . The unit immediately began training , but was not attached to the 1st Airborne Division for long ; during mid @-@ 1943 , the division was transported to the Middle East so it could participate in the Allied invasion of Sicily . ' C ' Squadron remained in Britain , as not enough Hamilcar gliders had been built by the time the division departed to transport its Tetrarchs ; the squadron was transferred to the 6th Airborne Division , which had been raised in April 1943 , and ' C ' Squadron remained with it for the rest of the conflict . 
The squadron continued to train as an air @-@ portable unit , and participated in a number of exercises to prepare for its new duties , including reconnaissance of enemy positions and counter @-@ attacking enemy infantry and armour . 
- On 13 December 1943 , the War Office decided to expand the squadron into a regiment equipped with a combination of light tanks and conventional reconnaissance vehicles such as scout cars , and on 1 April 1944 , it was re @-@ designated as the 6th Airborne Armoured Reconnaissance Regiment . The regiment consisted of a Headquarters Squadron , a Light Tank Squadron and a Reconnaissance Squadron ; two Tetrarchs , the Mark 1 CS variation , were attached to the Headquarters Squadron , but the Light Tank Squadron , also known as ' A ' Squadron , received the majority of the Tetrarchs . ' A ' Squadron had approximately nineteen Tetrarchs split between six troops , two of which were of the CS variation and the rest were armed with 2 pounders fitted with Littlejohn adaptors . On 24 May 1944 , after participating in a further series of exercises and manoeuvres , ' A ' Squadron moved from their training area to a transit camp at Tarrant Rushton airfield , while the rest of the regiment moved to RAF Brize Norton airfield the next day ; from these two airfields , the regiment would be transported to participate in the British airborne landings in Normandy . The operation began on the night of 5 June , with the deployment of 6th Airborne Division to eastern Normandy . It was tasked with protecting the eastern flank of the Allied seaborne landings , securing strategically important areas east of Caen , capturing several important bridges over the Caen Canal and River Dives , and destroying a coastal artillery battery . Insufficient transport aircraft were available to land all three of the division 's brigades simultaneously ; one would have to be landed in a second lift later in the day . 
Major General Richard Gale had initially intended for the 6th Airlanding Brigade , to which the 6th Airborne Armoured Reconnaissance Regiment was attached , to be landed first ; however , aerial photography revealed that anti @-@ glider poles had been erected in the landing zone selected for the brigade . Therefore , Gale decided that the 3rd Parachute Brigade and 5th Parachute Brigade ( which did not utilise gliders ) should land in the first lift to clear the landing zones , allowing the 6th Airlanding Brigade to land in the second lift . 
- The Horsa and Hamilcar gliders of the brigade landed at 21 : 00 on 6 June in a landing zone cleared of obstructions by the 5th Parachute Brigade . The primary tasks of the brigade were to bring in reinforcements and supplies , and to aid the two parachute brigades in consolidating the area held by the division ; the 6th Airborne Armoured Reconnaissance Squadron was to aid in the latter task , acting as a reconnaissance force to scout out German positions and impede the movement of German forces attempting to counter @-@ attack . The Tetrarchs of ' A ' Squadron were to play an integral part in this reconnaissance role due to their speed , but the squadron 's strength of twenty tanks was severely depleted by the time it landed in Normandy . It lost one tank before the formation landed when the Tetrarch broke loose of its shackles and crashed through the nose of the glider that was carrying it , causing both to fall into the sea mid @-@ flight . The squadron 's strength was further weakened when two gliders collided with each other in the landing zone , destroying themselves and the Tetrarchs they carried ; a third Hamilcar hit another Tetrarch as it was being unloaded and flipped the tank upside down , rendering it unusable , although the crew escaped without injury . The surviving tanks were then rendered temporarily immobile when parachute rigging lines became tangled in their suspensions , forcing their crews to cut the lines away with welding torches . 
- The squadron retrieved all of the remaining Tetrarchs and advanced to the south of the landing zone to link up with the rest of the regiment ; there , they received orders to support the 8th Parachute Battalion in the Bois de Bavent area and conduct reconnaissance duties . After linking with the battalion , the squadron began reconnoitring , and engaged German infantry and armour they encountered . By the end of 7 June , two Tetrarchs had been lost to enemy action , one destroyed by a German self @-@ propelled gun and the second by hitting a mine . The division was reinforced by British troops who were advancing from the invasion beaches and it began to push through Normandy , while the squadron continued its reconnaissance duties . At this time , Gale decided to avoid , when possible , engaging the Tetrarchs with German armour , as they proved to be completely outclassed by the German tanks and self @-@ propelled guns , such as the Panzer IV and the Sturmgeschütz III . Instead , when the division required armoured support , it summoned it from armoured units outside the division , and the Tetrarchs were used to support infantry patrols and provide fire support . By August , in the division 's preparation for the planned breakout from the Normandy bridgehead , the majority of Tetrarchs in ' A ' Squadron were replaced with Cromwell fast cruiser tanks ; only three Tetrarchs remained , assigned to the Headquarters troop of ' A ' Squadron . 
- 
- = = = Post @-@ war = = = 
- 
- Operation Tonga was the last that Tetrarchs saw of active combat . During the first week of October 1944 , the 6th Airborne Armoured Reconnaissance Regiment underwent an extensive reorganization , in which it was completely restructured , and all the remaining Tetrarchs were retired . They were replaced with the M22 Locust , a purpose @-@ built airborne light tank of American design ; eight Locusts were used by the regiment in March 1945 during Operation Varsity , the airborne operation to cross the river Rhine . A report issued by the Director ( Air ) of the War Office in January 1946 confirmed that the Tetrarch design was considered obsolete , and any light tanks used in post @-@ war airborne formations would be entirely new in design . A small number of Tetrarchs remained in service with the 3rd Hussars until 1949 ; a Hamilcar glider flight was stationed at RAF Fairford , and a troop of Tetrarchs was kept by the regiment for training exercises with the gliders . However , glider training by the regiment was stopped in 1950 and the Tetrarchs withdrawn from service . 
- 
- = = Variants = = 
- 
- There were several variants of the Tetrarch design . The first was the Light Tank Mk VIII , Vickers @-@ Armstrong 's proposed successor to the Tetrarch . The Mark VIII was also known as the Harry Hopkins , named after President Roosevelt 's chief diplomatic advisor , and was given the General Staff design number A25 by the War Office . The Mark VIII was intended to improve upon the design of the Tetrarch in a number of areas . It had thicker armour than the Tetrarch , with the front hull and turret armour increased to a thickness of 38 millimetres ( 1 @.@ 5 in ) and the side armour to 17 millimetres ( 0 @.@ 67 in ) , and the turret and hull given more sloped surfaces to help deflect shells fired at the tank . The dimensions of the Tetrarch were also changed ; the Mark VIII was longer by 6 inches ( 0 @.@ 15 m ) , wider by 1 foot 3 inches ( 0 @.@ 38 m ) and heavier . The new tank was no longer air @-@ portable , as it was too heavy to be carried by a Hamilcar . The 12 @-@ cylinder engine of the Tetrarch was fitted to the Mark VIII , although the increased weight meant that its maximum speed decreased to 30 miles per hour ( 48 km / h ) ; its armament also remained the same as that of the Tetrarch . The War Office authorised the construction of three prototype models in April 1941 . The new design was considered a success , and the Tank Board of the War Office ordered 1 @,@ 000 to be constructed in September . However , problems were encountered with further tests of the prototypes , and a report issued in December 1942 stated that production of the Mark VIII had been delayed due to developmental problems . These problems continued to persist into 1943 , when the War Office decided against using the tank in active service ; approximately 100 Mark VIIIs were produced by 1945 , when production ended . 
- A second variant on the Tetrarch design was the Tetrarch Duplex Drive ( " Tetrarch DD " ) . The Duplex Drive system was invented by Nicholas Straussler , and was designed to allow a tank to ' swim ' through water and participate in amphibious operations . The system functioned by erecting a large waterproof canvas screen around the tank above its tracks , which was supported by thirty @-@ six inflatable tubes and steel struts ; this gave the tank sufficient buoyancy to float , and the tank was then propelled along by a small propeller powered by the tank 's engine . The screen could be collapsed by using a small explosive charge once the tank reached land . The system was fitted during June 1941 , as the Tetrarch was the lightest light tank available at the time ; the converted tank was successfully tested on a number of lakes and reservoirs , allowing the Duplex Drive system to be tested on heavier tanks , such as the Valentine . The system would be used during Operation Overlord , when M4 Sherman medium tanks would land on the invasion beaches . 
- 
- 
- = QuackShot = 
- 
- QuackShot , known in Japan as I Love Donald Duck : Georgia Ou no Hihou ( Japanese : アイラブドナルドダック グルジア王の秘宝 , Hepburn : Ai Rabu Donarudo Dakku Gurujia Ō no Hihō ) , is a 1991 platforming video game developed and published by Sega for the Sega Genesis . The game was released in Europe in 1991 , in North America on December 19 , 1991 and in Japan on December 20 , 1991 . QuackShot stars Donald Duck and his three nephews , Huey , Dewey , and Louie , as treasure @-@ hunters , and is part of a series of games published by Sega that were based on Walt Disney cartoon characters . 
- QuackShot was released to mostly positive reviews from video game journalists . The game was universally lauded for its graphics , with magazines like Sega Pro describing them as " some of the best graphics around . " The game was also praised for its music and puzzles , as well as their clever use in the game . However , QuackShot was criticized for its controls , being described by IGN as " float @-@ y " and making certain segments of the game unnecessarily difficult . The game was also criticized for its lack of difficulty overall as well as its lack of speech samples , which several other Genesis games of the time had . 
- 
- = = Gameplay = = 
- 
- The player , as Donald , ventures through a variety of side @-@ scrolling levels . Generally , each level is divided into an overland part and a dungeon , such as the Maharajah 's palace or the temple in which the Great Duck Treasure resides . Although the player may choose any order to play the overland sections , various obstacles prevent the player from entering the dungeons outside of a specific order . In addition to this , some levels provide the player with vital clues which solve puzzles needed to progress in later sections . Once Donald has completed the overland section of an area , he may leave by calling his nephews ' biplane , and will return to the dungeon entrance of that area if the player chooses to return . 
- Donald is armed with a special gun that can shoot plungers , popcorn or bubble gum . Donald has unlimited plungers which can only temporarily stun enemies ( though bosses can still be damaged with plungers ) , and can collect popcorn and gum along the way or get the latter from Gyro Gearloose . Later in the game , the plunger is upgraded to act as a temporary platform to climb walls with and , when stuck to a passing bird , allows Donald to traverse longer distances . In Duckburg , India and Egypt , Donald can also pick up chili peppers which increase his temper , eventually temporarily allowing him to become invincible , increase his speed and knock out enemies in his path . 
- 
- = = Plot = = 
- 
- While Donald is flipping through some books in Scrooge McDuck 's library , a map falls out of a book relating to the treasure of King Garuzia , ruler of the Great Duck Kingdom in ancient times . The map leads to the location of the king 's most prized possession , hidden in a secret place shortly before his death . Donald thinks this is his path to riches . Unfortunately Big Bad Pete overhears and pursues Donald throughout the game hoping to steal the treasure . 
- Teamed with his nephews Huey , Dewey , and Louie , and using the partial map from the library , Donald begins his search in Duckburg , with the trail being directed to an Aztec pyramid in Mexico . Outside the pyramid , he is directed by a " sweet seniorita " to obtain a " hero key " from an explorer back in Duckburg to open the pyramid . Inside the pyramid , Donald meets Goofy , who gives him a strange note and a plunger to help him reach higher places , and tells him that Gyro Gearloose is looking for him back in Duckburg . Travelling across the rooftops of Duckburg to meet Gyro , Donald is given Gyro 's latest invention , bubblegum ammo that can break through walls . The last location on the partial map is Count Dracula 's castle in Transylvania , where Donald encounters a ghost who tells him that the Count carries the real treasure map . 
- After defeating Dracula , Donald receives a more complete map . In India , Donald enters the palace of the Maharajah , where she challenges him to defeat the tiger in her garden in exchange for a Sphinx Tear . Donald succeeds and receives the Sphinx Tear , which is the key to open a temple in Egypt . Donald is able to solve the " Riddle of the Sphinx " using the note Goofy had given him , and obtains the Scepter of Ra before escaping in a mine cart . From there , he journeys to the South Pole , where he finds a key frozen in ice , and uses the Scepter of Ra to melt the ice and grab the key . The key unlocks the hold of a Viking ship , which contains an ancient diary with the secret to locating the treasure . The ship is haunted by ghosts , and the Viking captain sends Donald below decks to get rid of them . After defeating a skeletal Viking warrior , Donald returns to the deck , where the captain informs him that the diary is hidden in ice near the South Pole , and gives him an " ancient Viking plunger " that attaches to flying creatures . Donald then returns to the South Pole , hitching a ride on one of Pete 's bird minions to reach the diary . 
- However , upon finding the diary , Pete shows up , holding Donald 's nephews hostage in exchange for the diary . After giving Pete the diary , Donald travels to Pete 's hideout to defeat Pete and get the diary back . The diary reveals that the map , when dipped in water , will reveal the location of the Great Duck Treasure . Donald flies to the island where the treasure is hidden and manages to evade its traps in order to reach the treasure vault . After defeating the ancient spirit guarding the treasure , Donald opens the vault only to find a simple stone statue . When the disappointed Donald returns home , Huey , Dewey and Louie accidentally break the statue , which reveals a golden jeweled necklace was hidden inside . Donald gives the necklace to Daisy and the two fly off into the sunset together . 
- 
- = = Development and release = = 
- 
- QuackShot was developed and published by Sega for the Sega Genesis as part of a series of games that were based on Walt Disney cartoon characters . The game was released in Europe in 1991 , in North America on December 19 , 1991 and in Japan on December 20 , 1991 . QuackShot was later released as part of a bundle called The Disney Collection for Genesis in 1996 alongside Castle of Illusion . The game was also ported to the Sega Saturn and released exclusively in Japan alongside Castle of Illusion again as part of the Sega Ages series in 1998 , entitled Sega Ages : I Love Mickey Mouse . 
- 
- = = Reception = = 
- 
- QuackShot received a mostly positive response from critics upon release . GameRankings , an aggregator for video game reviews , assigned the game a score of 77 % based on 2 reviews . Mega placed the game at # 7 in their " Top Mega Drive Games of All Time " list . MegaTech magazine praised the game 's graphics , but criticized the game 's easy difficulty level , explaining simply that " the graphics are excellent , but the game is easy to complete . " Damian Butt from Sega Pro also praised the graphics , stating that the game has " without [ a ] doubt some of the best graphics around " and that " the sprites and backgrounds are consistently excellent . " He also noted the game 's various puzzles and their use in the game , explaining that " [ e ] ven if the ideas are not original , the way they are strung together to accelerate the pace to overload is nothing short of breath @-@ taking . " Levi Buchanan from IGN gave QuackShot a 7 @.@ 3 / 10 , also lauding the graphics and animation as excellent and saying the music was pleasing . 
- Butt criticized Donald 's controls in certain situations in the game , as well as the difficulty of some levels and puzzles . Buchanan also criticized the controls , calling them " float @-@ y " and noted the difficulty in executing precision jumps , explaining that " [ i ] t 's far too easy to over- or under @-@ shoot a narrow column and slip to your doom . " Butt was also " dubious of the number of credits , " stating that the game may seem easy with unlimited continues , but that the player will " still need considerable skill to reach the treasure island . " Buchanan was disappointed with the lack of speech samples , explaining that it 's " a bit of a drag with a character that is so defined by his voice . " Ultimately , Butt said that " [ y ] ounger players will instantly be enthralled by Donald 's quest " and that " QuackShot is everything a cartoon game should be and more . " Buchanan summed up the game as being a " good platformer tripped up by some questionable controls " and recommended the game as " a mildly enjoyable 16 @-@ bit platformer that would fit nicely in your Genesis collection . " 
- 
- 
- = Olmec colossal heads = 
- 
- The Olmec colossal heads are at least seventeen monumental stone representations of human heads sculpted from large basalt boulders . The heads date from at least before 900 BC and are a distinctive feature of the Olmec civilization of ancient Mesoamerica . All portray mature men with fleshy cheeks , flat noses , and slightly crossed eyes ; their physical characteristics correspond to a type that is still common among the inhabitants of Tabasco and Veracruz . The backs of the monuments often are flat . The boulders were brought from the Sierra de los Tuxtlas mountains of Veracruz . Given that the extremely large slabs of stone used in their production were transported over large distances , requiring a great deal of human effort and resources , it is thought that the monuments represent portraits of powerful individual Olmec rulers . Each of the known examples has a distinctive headdress . The heads were variously arranged in lines or groups at major Olmec centres , but the method and logistics used to transport the stone to these sites remain unclear . 
- The discovery of a colossal head at Tres Zapotes in the nineteenth century spurred the first archaeological investigations of Olmec culture by Matthew Stirling in 1938 . Seventeen confirmed examples are known from four sites within the Olmec heartland on the Gulf Coast of Mexico . Most colossal heads were sculpted from spherical boulders but two from San Lorenzo Tenochtitlán were re @-@ carved from massive stone thrones . An additional monument , at Takalik Abaj in Guatemala , is a throne that may have been carved from a colossal head . This is the only known example from outside the Olmec heartland . 
- Dating the monuments remains difficult because of the movement of many from their original contexts prior to archaeological investigation . Most have been dated to the Early Preclassic period ( 1500 – 1000 BC ) with some to the Middle Preclassic ( 1000 – 400 BC ) period . The smallest weigh 6 tons , while the largest is variously estimated to weigh 40 to 50 tons , although it was abandoned and left unfinished close to the source of its stone . 
- 
- = = Olmec civilization = = 
- 
- The Olmec civilization developed in the lowlands of southeastern Mexico between 1500 and 400 BC . The Olmec heartland lies on the Gulf Coast of Mexico within the states of Veracruz and Tabasco , an area measuring approximately 275 kilometres ( 171 mi ) east to west and extending about 100 kilometres ( 62 mi ) inland from the coast . The Olmecs are regarded as the first civilization to develop in Mesoamerica and the Olmec heartland is one of six cradles of civilization worldwide , the others being the Norte Chico culture of South America , the Erlitou culture of China 's Yellow River , the Indus Valley Civilization of south Asia , the civilization of ancient Egypt and the Sumerian civilization of ancient Iraq . Of these , only the Olmec civilization developed in a lowland tropical forest setting . 
- The Olmecs were the first inhabitants of the Americas to construct monumental architecture and to settle in towns and cities . They were also the first people in the Americas to develop a sophisticated style of stone sculpture . In the first decade of the 21st century evidence emerged of Olmec writing , with the earliest examples of Olmec hieroglyphs dating to around 650 BC . Examples of script have been found on roller stamps and stone artefacts ; the texts are short and have been partially deciphered based on their similarity to other Mesoamerican scripts . The evidence of complex society developing in the Olmec heartland has led to the Olmecs being regarded as the " Mother Culture " of Mesoamerica , although this concept remains controversial . 
- Some of the Olmecs ' rulers seem to have served religious functions . The city of San Lorenzo was succeeded as the main centre of the civilization by La Venta in about 900 BC , with Tres Zapotes and Laguna de los Cerros possibly sharing the role ; other urban centres were much less significant . The nature and degree of the control exercised by the centres over a widespread rural population remains unclear . Very fine Olmec art , much clearly made for an elite , survives in several forms , notably Olmec figurines , and larger sculptures such as The Wrestler . The figurines have been recovered in large numbers and are mostly in pottery ; these were presumably widely available to the population . Together with these , of particular relevance to the colossal heads are the " Olmec @-@ style masks " in stone , so called because none have yet been excavated in circumstances that allow the proper archaeological identification of an Olmec context . These evocative stone face masks present both similarities and differences to the colossal heads . Two thirds of Olmec monumental sculpture represents the human form , and the colossal heads fall within this major theme of Olmec art . 
- 
- = = Dating = = 
- 
- The colossal heads cannot be precisely dated . However , the San Lorenzo heads were buried by 900 BC , indicating that their period of manufacture and use was earlier still . The heads from Tres Zapotes had been moved from their original context before they were investigated by archaeologists and the heads from La Venta were found partially exposed on the modern ground surface . The period of production of the colossal heads is therefore unknown , as is whether it spanned a century or a millennium . Estimates of the time span during which colossal heads were produced vary from 50 to 200 years . The San Lorenzo heads are believed to be the oldest , and are the most skillfully executed . All of the stone heads have been assigned to the Preclassic period of Mesoamerican chronology , generally to the Early Preclassic ( 1500 – 1000 BC ) , although the two Tres Zapotes heads and the La Cobata Head are attributed to the Middle Preclassic ( 1000 – 400 BC ) . 
- 
- = = Characteristics = = 
- 
- Olmec colossal heads vary in height from 1 @.@ 47 to 3 @.@ 4 metres ( 4 @.@ 8 to 11 @.@ 2 ft ) and weigh between 6 and 50 tons . All of the Olmec colossal heads depict mature men with flat noses and fleshy cheeks ; the eyes tend to be slightly crossed . The general physical characteristics of the heads are of a type that is still common among people in the Olmec region in modern times . The backs of the heads are often flat , as if the monuments were originally placed against a wall . All examples of Olmec colossal heads wear distinctive headdresses that probably represent cloth or animal hide originals . Some examples have a tied knot at the back of the head , and some are decorated with feathers . A head from La Venta is decorated with the head of a bird . There are similarities between the headdresses on some of the heads that have led to speculation that specific headdresses may represent different dynasties , or perhaps identify specific rulers . Most of the heads wear large earspools inserted into the ear lobes . 
- All of the heads are realistic , unidealised and frank descriptions of the men . It is likely that they were portraits of living ( or recently deceased ) rulers well known to the sculptors . Each head is distinct and naturalistic , displaying individualised features . They were once thought to represent ballplayers although this theory is no longer widely held ; it is possible , however , that they represent rulers equipped for the Mesoamerican ballgame . Facial expressions depicted on the heads vary from stern through placid to smiling . The most naturalistic Olmec art is the earliest , appearing suddenly without surviving antecedents , with a tendency towards more stylised sculpture as time progressed . Some surviving examples of wooden sculpture recovered from El Manatí demonstrate that the Olmecs are likely to have created many more perishable sculptures than works sculpted from stone . 
- In the late nineteenth century , José Melgar y Serrano described a colossal head as having " Ethiopian " features and speculations that the Olmec had African origins resurfaced in 1960 in the work of Alfonso Medellín Zenil and in the 1970s in the writings of Ivan van Sertima . Such speculation is not taken seriously by Mesoamerican scholars such as Richard Diehl and Ann Cyphers . 
- Although all the colossal heads are broadly similar , there are distinct stylistic differences in their execution . One of the heads from San Lorenzo bears traces of plaster and red paint , suggesting that the heads were originally brightly decorated . Heads did not just represent individual Olmec rulers ; they also incorporated the very concept of rulership itself . 
- 
- = = Manufacture = = 
- 
- The production of each colossal head must have been carefully planned , given the effort required to ensure the necessary resources were available ; it seems likely that only the more powerful Olmec rulers were able to mobilise such resources . The workforce would have included sculptors , labourers , overseers , boatmen , woodworkers and other artisans producing the tools to make and move the monument , in addition to the support needed to feed and otherwise attend to these workers . The seasonal and agricultural cycles and river levels needed to have been taken into account to plan the production of the monument and the whole project may well have taken years from beginning to end . 
- Archaeological investigation of Olmec basalt workshops suggests that the colossal heads were first roughly shaped using direct percussion to chip away both large and small flakes of stone . The sculpture was then refined by retouching the surface using hammerstones , which were generally rounded cobbles that could be of the same basalt as the monument itself , although this was not always the case . Abrasives were found in association with workshops at San Lorenzo , indicating their use in the finishing of fine detail . Olmec colossal heads were fashioned as in @-@ the @-@ round monuments with varying levels of relief on the same work ; they tended to feature higher relief on the face and lower relief on the earspools and headdresses . Monument 20 at San Lorenzo is an extensively damaged throne with a figure emerging from a niche . Its sides were broken away and it was dragged to another location before being abandoned . It is possible that this damage was caused by the initial stages of re @-@ carving the monument into a colossal head but that the work was never completed . 
- All seventeen of the confirmed heads in the Olmec heartland were sculpted from basalt mined in the Sierra de los Tuxtlas mountains of Veracruz . Most were formed from coarse grained dark grey basalt known as Cerro Cintepec basalt after a volcano in the range . Investigators have proposed that large Cerro Cintepec basalt boulders found on the southeastern slopes of the mountains are the source of the stone for the monuments . These boulders are found in an area affected by large lahars ( volcanic mudslides ) that carried substantial blocks of stone down the mountain slopes , which suggests that the Olmecs did not need to quarry the raw material for sculpting the heads . Roughly spherical boulders were carefully selected to mimic the shape of a human head . The stone for the San Lorenzo and La Venta heads was transported a considerable distance from the source . The La Cobata head was found on El Vigia hill in the Sierra de los Tuxtlas and the stone from Tres Zapotes Colossal Head 1 and Nestepe Colossal Head 1 ( also known as Tres Zapotes Monuments A and Q ) came from the same hill . 
- The boulders were transported over 150 kilometres ( 93 mi ) from the source of the stone . The exact method of transportation of such large masses of rock is unknown , especially since the Olmecs lacked beasts of burden and functional wheels , and they were likely to have used water transport whenever possible . Coastal currents of the Gulf of Mexico and in river estuaries might have made the waterborne transport of monuments weighing 20 tons or more impractical . Two badly damaged Olmec sculptures depict rectangular stone blocks bound with ropes . A largely destroyed human figure rides upon each block , with their legs hanging over the side . These sculptures may well depict Olmec rulers overseeing the transport of the stone that would be fashioned into their monuments . When transport over land was necessary , the Olmecs are likely to have used causeways , ramps and roads to facilitate moving the heads . The regional terrain offers significant obstacles such as swamps and floodplains ; avoiding these would have necessitated crossing undulating hill country . The construction of temporary causeways using the suitable and plentiful floodplain soils would have allowed a direct route across the floodplains to the San Lorenzo Plateau . Earth structures such as mounds , platforms and causeways upon the plateau demonstrate that the Olmec possessed the necessary knowledge and could commit the resources to build large @-@ scale earthworks . 
- The flat backs of many of the colossal heads represented the flat bases of the monumental thrones from which they were reworked . Only four of the seventeen heartland heads do not have flattened backs , indicating the possibility that the majority were reworked monuments . Alternatively , the backs of many of these massive monuments may have been flattened to ease their transport , providing a stable form for hauling the monuments with ropes . Two heads from San Lorenzo have traces of niches that are characteristic of monumental Olmec thrones and so were definitely reworked from earlier monuments . 
- 
- = = Known monuments = = 
- 
- Seventeen confirmed examples are known . An additional monument , at Takalik Abaj in Guatemala , is a throne that may have been carved from a colossal head . This is the only known example outside of the Olmec heartland on the Gulf Coast of Mexico . Possible fragments of additional colossal heads have been recovered at San Lorenzo and at San Fernando in Tabasco . Crude colossal stone heads are also known in the Southern Maya area where they are associated with the potbelly style of sculpture . Although some arguments have been made that they are pre @-@ Olmec , these latter monuments are generally believed to be influenced by the Olmec style of sculpture . 
- 
- = = = San Lorenzo = = = 
- 
- The ten colossal heads from San Lorenzo originally formed two roughly parallel lines running north @-@ south across the site . Although some were recovered from ravines , they were found close to their original placements and had been buried by local erosion . These heads , together with a number of monumental stone thrones , probably formed a processional route across the site , powerfully displaying its dynastic history . Two of the San Lorenzo heads had been re @-@ carved from older thrones . 
- San Lorenzo Colossal Head 1 ( also known as San Lorenzo Monument 1 ) was lying facing upwards when excavated . The erosion of a path passing on top of the monument uncovered its eye and led to the discovery of the Olmec site . Colossal Head 1 is 2 @.@ 84 metres ( 9 @.@ 3 ft ) high ; it measures 2 @.@ 11 metres ( 6 @.@ 9 ft ) wide and it weighs 25 @.@ 3 tons . The monument was discovered partially buried at the edge of a gully by Matthew Stirling in 1945 . When discovered it was lying on its back , looking upwards . It was associated with a large number of broken ceramic vessels and figurines . The majority of these ceramic remains have been dated to between 800 and 400 BC ; some pieces have been dated to the Villa Alta phase ( Late Classic period , 800 – 1000 AD ) . The headdress possesses a plain band that is tied at the back of the head . The upper portion of the headdress is decorated with a U @-@ shaped motif . This element descends across the front of the headdress , terminating on the forehead . On the front portion it is decorated with five semicircular motifs . The scalp piece does not meet the horizontal band , leaving a space between the two pieces . On each side of the face a strap descends from the headdress and passes in front of the ear . The forehead is wrinkled in a frown . The lips are slightly parted without revealing the teeth . The cheeks are pronounced and the ears are particularly well executed . The face is slightly asymmetric , which may be due to error on the part of the sculptors or may accurately reflect the physical features of the portrait 's subject . The head has been moved to the Museo de Antropología de Xalapa ( " Anthropological Museum of Xalapa " ) . 
- San Lorenzo Colossal Head 2 ( also known as San Lorenzo Monument 2 ) was reworked from a monumental throne . The head stands 2 @.@ 69 metres ( 8 @.@ 8 ft ) high and measures 1 @.@ 83 metres ( 6 @.@ 0 ft ) wide by 1 @.@ 05 metres ( 3 @.@ 4 ft ) deep ; it weighs 20 tons . Colossal Head 2 was discovered in 1945 when Matthew Stirling 's guide cleared away some of the vegetation and mud that covered it . The monument was found lying on its back , facing the sky , and was excavated in 1946 by Stirling and Philip Drucker . In 1962 the monument was removed from the San Lorenzo plateau in order to put it on display as part of " The Olmec tradition " exhibition at the Museum of Fine Arts in Houston in 1963 . San Lorenzo Colossal Head 2 is currently in the Museo Nacional de Antropología in Mexico City . The head was associated with a number of ceramic finds ; they have been dated to the Early Preclassic and Late Classic periods . Colossal Head 2 wears a complex headdress that sports a horizontal band tied at the back of the head ; this is decorated with three bird 's heads that are located above the forehead and temples . The scalp piece is formed from six strips running towards the back of the head . The front of the headdress above the horizontal band is plain . Two short straps hang down from the headdress in front of the ears . The ear jewellery is formed by large squared hoops or framed discs . The left and right ornaments are different , with radial lines on the left earflare , a feature absent on the right earflare . The head is badly damaged due to an unfinished reworking process . This process has pitmarked the entire face with at least 60 smaller hollows and 2 larger holes . The surviving features appear to depict an ageing man with the forehead creased into a frown . The lips are thick and slightly parted to reveal the teeth ; the head has a pronounced chin . 
- San Lorenzo Colossal Head 3 is also known as San Lorenzo Monument 3 . The head measures 1 @.@ 78 metres ( 5 @.@ 8 ft ) high by 1 @.@ 63 metres ( 5 @.@ 3 ft ) wide by 0 @.@ 95 metres ( 3 @.@ 1 ft ) deep and weighs 9 @.@ 4 tons . The head was discovered in a deep gully by Matthew Stirling in 1946 ; it was found lying face down and its excavation was difficult due to the wet conditions in the gully . The monument was found 0 @.@ 8 kilometres ( 0 @.@ 50 mi ) southwest of the main mound at San Lorenzo , however , its original location is unknown ; erosion of the gully may have resulted in significant movement of the sculpture . Head 3 has been moved to the Museo de Antropología de Xalapa . The headdress is complex , with the horizontal basal band being formed by four horizontal cords , with diagonal folds above each eye . A small skullcap tops the headdress . A large flap formed of four cords drops down both sides of the head , completely covering the ears . The face has a typically frowning brow and , unusually , has clearly defined eyelids . The lips are thick and slightly parted ; the front of the lower lip has broken away completely , and the lower front of the headdress is pitted with 27 irregularly spaced artificial depressions . 
- San Lorenzo Colossal Head 4 ( also known as San Lorenzo Monument 4 ) weighs 6 tons and has been moved to the Museo de Antropología de Xalapa . Colossal Head 4 is 1 @.@ 78 metres ( 5 @.@ 8 ft ) high , 1 @.@ 17 metres ( 3 @.@ 8 ft ) wide and 0 @.@ 95 metres ( 3 @.@ 1 ft ) deep . The head was discovered by Matthew Stirling in 1946 , 550 metres ( 600 yd ) northwest of the principal mound , at the edge of a gully . When excavated , it was found to be lying on its right @-@ hand side and in a very good state of preservation . Ceramic materials excavated with the head became mixed with ceramics associated with Head 5 , making ceramic dating of the monument difficult . The headdress is decorated with a horizontal band formed of four sculpted cords , similar to those of Head 3 . On the right @-@ hand side , three tassels descend from the upper portion of the headdress ; they terminate in a total of eight strips that hang down across the horizontal band . These tassels are judged to represent hair rather than cords . Also on the right hand side , two cords descend across the ear and continue to the base of the monument . On the left @-@ hand side , three vertical cords descend across the ear . The earflare is only visible on the right hand side ; it is formed of a plain disc and peg . The face is that of an ageing man with a creased forehead , low cheekbones and a prominent chin . The lips are thick and slightly parted . 
- San Lorenzo Colossal Head 5 is also known as San Lorenzo Monument 5 . The monument stands 1 @.@ 86 metres ( 6 @.@ 1 ft ) high and measures 1 @.@ 47 metres ( 4 @.@ 8 ft ) wide by 1 @.@ 15 metres ( 3 @.@ 8 ft ) deep . It weighs 11 @.@ 6 tons . The head was discovered by Matthew Stirling in 1946 , face down in a gully to the south of the principal mound . The head is particularly well executed and is likely to have been found close to its original location . Ceramics recovered during its excavation became mixed with those from the excavation of Head 4 . The mixed ceramics have been dated to the San Lorenzo and Villa Alta phases ( approximately 1400 – 1000 BC and 800 – 1000 AD respectively ) . Colossal Head 5 is particularly well preserved , although the back of the headdress band was damaged when the head was moved from the archaeological site . The band of the headdress is set at an angle and has a notch above the bridge of the nose . The headdress is decorated with jaguar paws ; this general identification of the decoration is contested by Beatriz de la Fuente since the " paws " have three claws each ; she identifies them as the claws of a bird of prey . At the back of the head , ten interlaced strips form a net decorated with disc motifs . Two short straps descend from the headdress in front of the ears . The ears are adorned with disc @-@ shaped earspools with pegs . The face is that of an ageing man with wrinkles under the eyes and across the bridge of the nose , and a forehead that is creased in a frown . The lips are slightly parted . Colossal Head 5 has been moved to the Museo de Antropología de Xalapa . 
- San Lorenzo Colossal Head 6 ( also known as San Lorenzo Monument 17 ) is one of the smaller examples of colossal heads , standing 1 @.@ 67 metres ( 5 @.@ 5 ft ) . It measures 1 @.@ 41 metres ( 4 @.@ 6 ft ) wide by 1 @.@ 26 metres ( 4 @.@ 1 ft ) deep and is estimated to weigh between 8 and 10 tons . The head was discovered by a local farmworker and was excavated in 1965 by Luis Aveleyra and Román Piña Chan . The head had collapsed into a ravine under its own weight and was found face down on its left hand side . In 1970 it was transported to the Metropolitan Museum of Art in New York for the museum 's centenary exhibition . After its return to Mexico , it was placed in the Museo Nacional de Antropología in Mexico City . It is sculpted with a net @-@ like head covering joined together with sculpted beads . A covering descends from under the headdress to cover the back half of the neck . The headband is divided into four strips and begins above the right ear , extending around the entire head . A short strap descends from either side of the head to the ear . The ear ornaments are complex and are larger at the front of the ear than at the back . The face is that of an ageing male with the forehead creased in a frown , wrinkles under the eyes , sagging cheeks and deep creases on either side of the nose . The face is somewhat asymmetric , possibly due to errors in the execution of the monument . 
- San Lorenzo Colossal Head 7 ( also known as San Lorenzo Monument 53 ) measures 2 @.@ 7 metres ( 8 @.@ 9 ft ) high by 1 @.@ 85 metres ( 6 @.@ 1 ft ) wide by 1 @.@ 35 metres ( 4 @.@ 4 ft ) deep and weighs 18 tons . San Lorenzo Colossal Head 7 was reworked from a monumental throne ; it was discovered by a joint archaeological project by the Instituto Nacional de Antropología e Historia and Yale University , as a result of a magnetometer survey . It was buried at a depth of less than 1 metre ( 3 @.@ 3 ft ) and was lying facing upwards , leaning slightly northwards on its right hand side . The head is poorly preserved and has suffered both from erosion and deliberate damage . The headdress is decorated with a pair of human hands ; a feathered ornament is carved at the back of the headband and two discs adorn the front . A short strap descends from the headband and hangs in front of the right ear . The head sports large earflares that completely cover the earlobes , although severe erosion makes their exact form difficult to distinguish . The face has wrinkles between the nose and cheeks , sagging cheeks and deep @-@ set eyes ; the lips are badly damaged and the mouth is open , displaying the teeth . In 1986 the head was transported to the Museo de Antropología de Xalapa . 
- San Lorenzo Colossal Head 8 ( also known as San Lorenzo Monument 61 ) stands 2 @.@ 2 metres ( 7 @.@ 2 ft ) high ; it measures 1 @.@ 65 metres ( 5 @.@ 4 ft ) wide by 1 @.@ 6 metres ( 5 @.@ 2 ft ) deep and weighs 13 tons . It is one of the finest examples of an Olmec colossal head . It was found lying on its side to the south of a monumental throne . The monument was discovered at a depth of 5 metres ( 16 ft ) during a magnetometer survey of the site in 1968 ; it has been dated to the Early Preclassic . After discovery it was initially reburied ; it was moved to the Museo de Antropología de Xalapa in 1986 . The headdress is decorated with the talons or claws of either a jaguar or an eagle . It has a headband and a cover that descends from under the headdress proper behind the ears . Two short straps descend in front of the ears . The head sports large ear ornaments in the form of pegs . The face is that of a mature male with sagging cheeks and wrinkles between these and the nose . The forehead is gathered in a frown . The mouth is slightly parted to reveal the teeth . Most of the head is carved in a realistic manner , the exception being the ears . These are stylised and represented by one question mark shape contained within another . The head is very well preserved and displays a fine finish . 
- San Lorenzo Colossal Head 9 is also known as San Lorenzo Monument 66 . It measures 1 @.@ 65 metres ( 5 @.@ 4 ft ) high by 1 @.@ 36 metres ( 4 @.@ 5 ft ) wide by 1 @.@ 17 metres ( 3 @.@ 8 ft ) deep . The head was exposed in 1982 by erosion of the gullies at San Lorenzo ; it was found leaning slightly on its right hand side and facing upwards , half covered by the collapsed side of a gully and washed by a stream . Although it was documented by archaeologists , it remained for some time in its place of discovery before being moved to the Museo de Antropología de Xalapa . The headdress is of a single piece without a distinct headband . The sides display features that are possibly intended to represent long hair trailing to the bottom of the monument . The earflares are rectangular plates with an additional trapezoid element at the front . The head is also depicted wearing a nose @-@ ring . The face is smiling and has wrinkles under the eyes and at the edge of the mouth . It has sagging cheeks and wide eyes . The mouth is closed and the upper lip is badly damaged . The sculpture suffered some mutilation in antiquity , with nine pits hollowed into the face and headdress . 
- San Lorenzo Colossal Head 10 ( also known as San Lorenzo Monument 89 ) has been moved to the Museo Comunitario de San Lorenzo Tenochtitlán near Texistepec . It stands 1 @.@ 8 metres ( 5 @.@ 9 ft ) tall and measures 1 @.@ 43 metres ( 4 @.@ 7 ft ) wide by 0 @.@ 92 metres ( 3 @.@ 0 ft ) deep ; it weighs 8 tons . The head was discovered by a magnetometer survey in 1994 ; it was found buried , lying face upwards in the bottom of a ravine and was excavated by Ann Cyphers . The headdress is formed of 92 circular beads that completely cover the upper part of the head and descend across the sides and back . Above the forehead is a large element forming a three @-@ toed foot with long nails , possibly the foot of a bird . The head wears large earspools that protrude beyond the beads of the headdress . The spools have the form of a rounded square with a circular sunken central portion . The face is that of a mature man with the mouth closed , sagging cheeks and lines under the eyes . The mouth is sensitively carved and the head possesses a pronounced chin . 
- 
- = = = La Venta = = = 
- 
- Three of the La Venta heads were found in a line running east @-@ west in the northern Complex I ; all three faced northwards , away from the city centre . The other head was found in Complex B to the south of the Great Pyramid , in a plaza that included a number of other sculptures . The latter , the first of the La Venta heads to be discovered , was found during archaeological exploration of La Venta in 1925 ; the other three remained unknown to archaeologists until a local boy guided Matthew Stirling to them while he was excavating the first head in 1940 . They were located approximately 0 @.@ 9 kilometres ( 0 @.@ 56 mi ) to the north of Monument 1 . 
- La Venta Monument 1 is speculated to have been the portrait of La Venta 's final ruler . Monument 1 measures 2 @.@ 41 metres ( 7 @.@ 9 ft ) high by 2 @.@ 08 metres ( 6 @.@ 8 ft ) wide by 1 @.@ 95 metres ( 6 @.@ 4 ft ) deep ; it weighs 24 tons . The front of the headdress is decorated with three motifs that apparently represent the claws or fangs of an animal . Above these symbols is an angular U @-@ shaped decoration descending from the scalp . On each side of the monument a strap descends from the headdress , passing in front of the ear . Each ear has a prominent ear ornament that descends from the earlobe to the base of the monument . The features are those of a mature man , with wrinkles around the mouth , eyes and nose . Monument 1 is the best preserved head at La Venta but has suffered from erosion , particularly at the back . The head was first described by Franz Blom and Oliver La Farge who investigated the La Venta remains on behalf of Tulane University in 1925 . When discovered it was half @-@ buried ; its massive size meant that the discoverers were unable to excavate it completely . Matthew Stirling fully excavated the monument in 1940 , after clearing the thick vegetation that had covered it in the intervening years . Monument 1 has been moved to the Parque @-@ Museo La Venta in Villahermosa . The head was found in its original context ; associated finds have been radiocarbon dated to between 1000 and 600 BC . 
- La Venta Monument 2 measures 1 @.@ 63 metres ( 5 @.@ 3 ft ) high by 1 @.@ 35 metres ( 4 @.@ 4 ft ) wide by 0 @.@ 98 metres ( 3 @.@ 2 ft ) deep ; the head weighs 11 @.@ 8 tons . The face has a broadly smiling expression that reveals four of the upper teeth . The cheeks are given prominence by the action of smiling ; the brow that is normally visible in other heads is covered by the rim of the headdress . The face is badly eroded , distorting the features . In addition to the severe erosion damage , the upper lip and a part of the nose have been deliberately mutilated . The head was found in its original context a few metres north of the northwest corner of pyramid @-@ platform A @-@ 2 . Radiocarbon dating of the monument 's context dates it to between 1000 and 600 BC . Monument 2 has suffered erosion damage from its exposure to the elements prior to discovery . The head has a prominent headdress but this is badly eroded and any individual detail has been erased . A strap descends in front of the ear on each side of the head , descending as far as the earlobe . The head is adorned with ear ornaments in the form of a disc that covers the earlobe , with an associated clip or peg . The surviving details of the headdress and earflares are stylistically similar to those of Tres Zapotes Monument A. The head has been moved to the Museo del Estado de Tabasco in Villahermosa . 
- La Venta Monument 3 stands 1 @.@ 98 metres ( 6 @.@ 5 ft ) high and measures 1 @.@ 6 metres ( 5 @.@ 2 ft ) wide by 1 metre ( 3 @.@ 3 ft ) deep ; it weighs 12 @.@ 8 tons . Monument 3 was located a few metres to the east of Monument 2 , but was moved to the Parque @-@ Museo La Venta in Villahermosa . Like the other La Venta heads , its context has been radiocarbon dated to between 1000 and 600 BC . It appears unfinished and has suffered severe damage through weathering , making analysis difficult . It has a large headdress that reaches to the eyebrows but any details have been lost through erosion . Straps descend in front of each ear and continue to the base of the monument . The ears are wearing large flattened rings that overlap the straps ; they probably represent jade ornaments of a type that have been recovered in the Olmec region . Although most of the facial detail is lost , the crinkling of the bridge of the nose is still evident , a feature that is common to the frowning expressions of the other Olmec colossal heads . 
- La Venta Monument 4 measures 2 @.@ 26 metres ( 7 @.@ 4 ft ) high by 1 @.@ 98 metres ( 6 @.@ 5 ft ) wide and 1 @.@ 86 metres ( 6 @.@ 1 ft ) deep . It weighs 19 @.@ 8 tons . It was found a few metres to the west of Monument 2 and has been moved to the Parque @-@ Museo La Venta . As with the other heads in the group , its archaeological context has been radiocarbon dated to between 1000 and 600 BC . The headdress is elaborate and , although damaged , various details are still discernible . The base of the headdress is formed by three horizontal strips running over the forehead . One side is decorated with a double @-@ disc motif that may have been repeated on the other ; if so , damage to the right side has obliterated any trace of it . The top of the headdress is decorated with the clawed foot of a bird of prey . Either straps or plaits of hair descend on either side of the face , from the headdress to the base of the monument . Only one earspool survives ; it is flat , in the form of a rounded square , and is decorated with a cross motif . The ears have been completely eroded away and the lips are damaged . The surviving features display a frown and creasing around the nose and cheeks . The head displays prominent teeth . 
- 
- = = = Tres Zapotes = = = 
- 
- The two heads at Tres Zapotes , with the La Cobata head , are stylistically distinct from the other known examples . Beatriz de la Fuente views them as a late regional survival of an older tradition while other scholars argue that they are merely the kind of regional variant to be expected in a frontier settlement . These heads are sculpted with relatively simple headdresses ; they have squat , wide proportions and distinctive facial features . The two Tres Zapotes heads are the earliest known stone monuments from the site . The discovery of one of the Tres Zapotes heads in the nineteenth century led to the first archaeological investigations of Olmec culture , carried out by Matthew Stirling in 1938 . 
- Tres Zapotes Monument A ( also known as Tres Zapotes Colossal Head 1 ) was the first colossal head to be found , discovered by accident in the middle of the nineteenth century , 1 kilometre ( 0 @.@ 62 mi ) to the north of the modern village of Tres Zapotes . After its discovery it remained half @-@ buried until it was excavated by Matthew Stirling in 1939 . At some point it was moved to the plaza of the modern village , probably in the early 1960s . It has since been moved to the Museo Comunitario de Tres Zapotes . Monument A stands 1 @.@ 47 metres ( 4 @.@ 8 ft ) tall ; it measures 1 @.@ 5 metres ( 4 @.@ 9 ft ) wide by 1 @.@ 45 metres ( 4 @.@ 8 ft ) deep , and is estimated to weigh 7 @.@ 8 tons . The head is sculpted with a simple headdress with a wide band that is otherwise unadorned , and wears rectangular ear ornaments that project forwards onto the cheeks . The face is carved with deep creases between the cheeks and the nose and around the mouth ; the forehead is creased into a frown . The upper lip has suffered recent damage , with the left portion flaking away . 
- Tres Zapotes Monument Q ( also known as the Nestape Head and Tres Zapotes Colossal Head 2 ) measures 1 @.@ 45 metres ( 4 @.@ 8 ft ) high by 1 @.@ 34 metres ( 4 @.@ 4 ft ) wide by 1 @.@ 26 metres ( 4 @.@ 1 ft ) deep and weighs 8 @.@ 5 tons . Its exact date of discovery is unknown but is estimated to have been some time in the 1940s , when it was struck by machinery being used to clear vegetation from Nestape hill . Monument Q was the eleventh colossal head to be discovered . It was moved to the plaza of Santiago Tuxtla in 1951 and remains there to this day . Monument Q was first described by Williams and Heizer in an article published in 1965 . The headdress is decorated with a frontal tongue @-@ shaped ornament , and the back of the head is sculpted with seven plaits of hair bound with tassels . A strap descends from each side of the headdress , passing over the ears and to the base of the monument . The face has pronounced creases around the nose , mouth and eyes . 
- 
- = = = La Cobata = = = 
- 
- The La Cobata region was the source of the basalt used for carving all of the colossal heads in the Olmec heartland . The La Cobata colossal head was discovered in 1970 and was the fifteenth to be recorded . It was discovered in a mountain pass in the Sierra de los Tuxtlas , on the north side of El Vigia volcano near to Santiago Tuxtla . The head was largely buried when found ; excavations uncovered a Late Classic ( 600 – 900 AD ) offering associated with the head consisting of a ceramic vessel and a 12 @-@ centimetre ( 4 @.@ 7 in ) long obsidian knife placed pointing northwards towards the head . The offering is believed to have been deposited long after the head was sculpted . The La Cobata head has been moved from its original location to the main plaza at Santiago . 
- The La Cobata head is more or less rounded and measures 3 by 3 metres ( 9 @.@ 8 by 9 @.@ 8 ft ) by 3 @.@ 4 metres ( 11 ft ) high , making it the largest known head . This massive sculpture is estimated to weigh 40 tons . It is stylistically distinct from the other examples , and Beatriz de la Fuente placed it late in the Olmec time frame . The characteristics of the sculpture have led to some investigators suggesting that it represents a deceased person . Norman Hammond argues that the apparent stylistic differences of the monument stem from its unfinished state rather than its late production . The eyes of the monument are closed , the nose is flattened and lacks nostrils and the mouth was not sculpted in a realistic manner . The headdress is in the form of a plain horizontal band . 
- The original location of the La Cobata head was not a major archaeological site and it is likely that the head was either abandoned at its source or during transport to its intended destination . Various features of the head suggest that it was unfinished , such as a lack of symmetry below the mouth and an area of rough stone above the base . Rock was not removed from around the earspools as on other heads , and does not narrow towards the base . Large parts of the monument seem to be roughed out without finished detail . The right hand earspool also appears incomplete ; the forward portion is marked with a sculpted line while the rear portion has been sculpted in relief , probably indicating that the right cheek and eye area were also unfinished . The La Cobata head was almost certainly carved from a raw boulder rather than being sculpted from a throne . 
- 
- = = = Takalik Abaj = = = 
- 
- Takalik Abaj Monument 23 dates to the Middle Preclassic period , and is found in Takalik Abaj , an important city in the foothills of the Guatemalan Pacific coast , in the modern department of Retalhuleu . It appears to be an Olmec @-@ style colossal head re @-@ carved into a niche figure sculpture . If originally a colossal head then it would be the only known example from outside the Olmec heartland . 
- Monument 23 is sculpted from andesite and falls in the middle of the size range for confirmed colossal heads . It stands 1 @.@ 84 metres ( 6 @.@ 0 ft ) high and measures 1 @.@ 2 metres ( 3 @.@ 9 ft ) wide by 1 @.@ 56 metres ( 5 @.@ 1 ft ) deep . Like the examples from the Olmec heartland , the monument features a flat back . Lee Parsons contests John Graham 's identification of Monument 23 as a re @-@ carved colossal head ; he views the side ornaments , identified by Graham as ears , as rather the scrolled eyes of an open @-@ jawed monster gazing upwards . Countering this , James Porter has claimed that the re @-@ carving of the face of a colossal head into a niche figure is clearly evident . 
- Monument 23 was damaged in the mid @-@ twentieth century by a local mason who attempted to break its exposed upper portion using a steel chisel . As a result , the top is fragmented , although the broken pieces were recovered by archaeologists and have been put back into place . 
- 
- = = Collections = = 
- 
- All of the 17 confirmed colossal heads remain in Mexico . Two heads from San Lorenzo are on permanent display at the Museo Nacional de Antropología in Mexico City . Seven of the San Lorenzo heads are on display in the Museo de Antropología de Xalapa . Five of them are in Sala 1 , one is in Sala 2 and one is in Patio 1 . The remaining San Lorenzo head is in the Museo Comunitario de San Lorenzo Tenochtitlán near Texistepec . All four heads from La Venta are now in Villahermosa , the state capital of Tabasco . Three are in the Parque @-@ Museo La Venta and one in the Museo del Estado de Tabasco . Two heads are on display in the plaza of Santiago Tuxtla ; one from Tres Zapotes and the La Cobata Head . The other Tres Zapotes head is in the Museo Comunitario de Tres Zapotes . 
- Several colossal heads have been loaned to temporary exhibitions abroad ; San Lorenzo Colossal Head 6 was loaned to the Metropolitan Museum of Art in New York in 1970 . San Lorenzo colossal heads 4 and 8 were lent to the Olmec Art of Ancient Mexico exhibition in the National Gallery of Art , Washington , D.C. that ran from 30 June to 20 October 1996 . San Lorenzo Head 4 was again loaned in 2005 , this time to the de Young Museum in San Francisco . The de Young Museum was loaned San Lorenzo colossal heads 5 and 9 for its Olmec : Colossal Masterworks of Ancient Mexico exhibition , which ran from 19 February to 8 May 2011 . 
- 
- = = = Vandalism = = = 
- 
- On 12 January 2009 , at least three people , including two Mexicans and one American , entered the Parque @-@ Museo La Venta in Villahermosa and damaged just under 30 archaeological pieces , including the four La Venta colossal heads . The vandals were all members of an evangelical church and appeared to have been carrying out a supposed pre @-@ Columbian ritual , during which salts , grape juice and oil were thrown on the heads . It was estimated that 300 @,@ 000 pesos ( US $ 21 @,@ 900 ) would be needed to repair the damage , and the restoration process would last four months . The three vandals were released soon after their arrest after paying 330 @,@ 000 pesos each . 
- 
- = = = Replicas = = = 
- 
- Although not all of the replicas were placed by him , the majority of replicas around the world were placed under the leadership of Miguel Alemán Velasco , former governor of the state of Veracruz . The following is a list of replicas and their locations within the United States : 
- Austin , Texas . A replica of San Lorenzo Head 1 was placed in the Teresa Lozano Long Institute of Latin American Studies at the University of Texas in November 2008 . 
- Chicago , Illinois . A replica of San Lorenzo Head 8 made by Ignacio Perez Solano was placed in the Field Museum of Natural History in 2000 . 
- Covina , California . A replica of San Lorenzo Head 5 was donated to Covina in 1989 , originally intended to be placed in Jalapa Park . Due to concerns over potential vandalism it was instead installed outside the police station . It was removed in 2011 and relocated to Jobe 's Glen , Jalapa Park in June 2012 . 
- McAllen , Texas . A replica of San Lorenzo Head 8 is located in the International Museum of Art & Science . The specific date of placement is unknown , but it was dedicated by Fidel Herrera Beltrán , then governor of Veracruz , during his time in office between 2004 and 2010 . 
- New York . A replica of San Lorenzo Head 1 was placed next to the main plaza in the grounds of Lehman College in the Bronx , New York . It was installed in 2013 to celebrate the first anniversary of the CUNY Institute of Mexican Studies , housed at the college . The replica was a gift by the government of Veracruz state , Cumbre Tajín and Mexico Trade ; it was first placed in Dag Hammarskjöld Park , outside the United Nations , in 2012 . 
- San Francisco , California . A replica of San Lorenzo Head 1 created by Ignacio Perez Solano was placed in San Francisco City College , Ocean Campus in October 2004 . 
- Washington , D.C. A replica of San Lorenzo Head 4 sculpted by Ignacio Perez Solano was placed near the Constitution Avenue entrance of the Smithsonian National Museum of Natural History in October 2001 . 
- West Valley City , Utah . A replica of San Lorenzo Head 8 was placed in the Utah Cultural Celebration Center in May 2004 . 
- Mexico donated a resin replica of an Olmec colossal head to Belgium ; it is on display in the Musée du Cinquantenaire in Brussels . 
- In February 2010 , the Mexican Secretaría de Relaciones Exteriores ( Secretariat of Foreign Affairs ) announced that the Instituto Nacional de Antropología e Historia would be donating a replica Olmec colossal head to Ethiopia , to be placed in Plaza Mexico in Addis Ababa . 
- 
- 
- = Brad Stevens = 
- 
- Bradley Kent " Brad " Stevens ( born October 22 , 1976 ) is an American professional basketball head coach for the Boston Celtics of the NBA . He was previously the head coach at Butler University in Indianapolis . A former basketball player , he grew up in Zionsville , Indiana , where he starred on the Zionsville Community High School basketball team , setting four school records . After high school , he attended DePauw University , where he played basketball and earned a degree in economics . He made the all @-@ conference team multiple times and was a three @-@ time Academic All @-@ America nominee . 
- Stevens joined the Butler basketball program as a volunteer prior to the 2000 – 01 season after quitting his job at Eli Lilly and Company . He was promoted to a full @-@ time assistant coaching position for the 2001 – 02 season . On April 4 , 2007 , he became the head coach after Todd Lickliter left to coach the Iowa Hawkeyes . In his first year , Stevens led Butler to 30 wins , becoming the third @-@ youngest head coach in NCAA Division I history to have a 30 @-@ win season . 
- In 2010 , his third year as head coach , Stevens broke the NCAA record for most wins in a coach 's first three years , exceeding the previous record by eight . In the postseason , Stevens coached Butler to the first Final Four in school history . At 33 years old , Stevens became the second @-@ youngest head coach to make a NCAA National Championship game , losing 61 – 59 to Duke . Shortly after the season ended , he signed a contract extension with Butler through the 2011 – 12 season . With the 2010 – 11 team making the Final Four , Stevens became the youngest coach to go to two Final Fours . Stevens coached the Bulldogs in their second consecutive national championship game on April 4 , 2011 , where the team lost to the Huskies of the University of Connecticut . 
- Stevens is known for a calm , focused coaching style . He spends a lot of time analyzing opponents using statistical analysis , adding new wrinkles to his team 's play each game . He puts a strong emphasis on defensive and team oriented basketball . Butler 's success against teams with superior athletes has been attributed to Stevens ' coaching style and calm demeanor . Stevens has twice been named the Horizon League Coach of the Year and won collegeinsider.com 's Hugh Durham Award mid @-@ season honors in January 2009 . He has also been both a Hugh Durham Award and Jim Phelan Award finalist all three years of his career . Stevens has been called a coaching prodigy and compared to John Wooden . He is married with two young children . In July 2013 , he signed a six @-@ year , 22 million dollar contract to become the head coach of the Boston Celtics in the NBA . In April 2015 , Stevens led the Celtics to the NBA Playoffs as the 7th seed in the Eastern Conference with a 40 – 42 record . 
- 
- = = Early life = = 
- 
- Bradley Kent Stevens grew up in Zionsville , Indiana , where he developed his love for basketball . Starting at age five , Stevens would watch taped basketball games " before he went to afternoon kindergarten " . His father would often drive him to Bloomington , to watch Indiana Hoosiers games . " It 's hard not to be [ in love with basketball ] when you 're a kid growing up in Indiana " , Stevens later said . 
- For his eighth birthday , Stevens received a new basketball hoop . " It ’ s so much fun to dream in your driveway , " he later remarked . " That ’ s where my friends and I hung out . It was a lot of fun to grow up in that era . " When a friend , Brandon Monk , had a basketball court installed in his back yard , Stevens " appeared instantaneously . " He was so dedicated to the game that he would bring the unprepared ingredients for grilled cheese sandwiches over to Monk 's house , so that he would not waste time waiting for the sandwiches to cook . 
- Monk 's court soon became a gathering place , where Zionsville kids and kids from the surrounding areas would hold pickup games . These games helped develop Stevens ' competitive streak . Besides playing basketball , the young Stevens also enjoyed solving puzzles , a skill he later applied to analyzing opposing teams to find their weaknesses . 
- Stevens attended Zionsville Community High School , where he became a star basketball player . He wore No. 31 in high school in honor of Indiana Pacers guard Reggie Miller . During his freshman year , he would get up early to practice shooting at a local gym before school . The hard work paid off as Stevens made the varsity team that same year . By the time his high school career was complete , Stevens had set school records for career scoring , assists , steals , and three @-@ point field goals . As of 2010 , he still holds the records for points ( 1508 ) , assists ( 444 ) , and steals ( 156 ) , as well as the single @-@ season points record ( 644 in 1995 ) . Stevens was named to the all @-@ conference team three times . In 1995 , he was named the sectional MVP and was the leading scorer in state sectional play ( 32 @.@ 3 ppg ) . 
- Stevens made the academic all @-@ state first team and received the Straight A Gold Medal Award all four years . He was a member of the National Honor Society , graduating seventh in his class of 165 . He earned three letters in basketball , three in track , and one in baseball during his days at Zionsville . During summers , he traveled the country playing AAU basketball . 
- Although Stevens had a strong passion for the game , he realized that his basketball skills were modest and not likely to get him very far . As such , he chose to attend academically oriented DePauw University for college . During his stay , he played in all 101 DePauw games , earning four varsity letters . He earned multiple all @-@ conference and academic all @-@ conference awards , and was a three @-@ time Academic All @-@ America nominee . He was a team captain his senior year , and averaged more than 8 points per game three of his four years . His career highs were 24 points and 8 rebounds in a game . After his senior year , Stevens received the Coaches ’ Award . Coach Bill Fenlon later described Stevens as " one of the most selfless , team @-@ oriented person [ sic ] I 've ever been around . " 
- While at DePauw , Stevens was a member of the Management Fellows Honors Program and the DePauw Community Services ’ Sports Night executive board . He was also a brother of the Alpha Tau Omega fraternity . During summer vacations , Stevens spent time teaching at Butler basketball camps . He was named to the Dean 's list and graduated in 1999 with a degree in economics . 
- 
- = = College career = = 
- 
- In the summer of 2000 , Stevens was offered the opportunity to volunteer in the Butler basketball office . He ran the idea of quitting his job at Eli Lilly by then @-@ longtime girlfriend Tracy Wilhelmy . She thought about it for two hours before telling him to go for it . " Now , it looks like a great idea , " Stevens later remarked . " At the time , I thought it was something I really wanted to try . " Tracy went back to school to get a law degree that could support the couple if things did not work out for Brad . " We were 23 and realized this was our chance , " Tracy later said . " Five years down the road , we were probably not going to be in a position to do that . The more success you had at Lilly , the harder it would be to leave . " 
- Stevens planned to live in a friend 's basement and took a job at Applebee ’ s to pay the bills . Before he started training at Applebee 's , he was offered a low @-@ paying administrative position as coordinator of basketball operations under then @-@ coach Thad Matta . The position had opened up when assistant coach Jamal Meeks resigned after being arrested on solicitation and drug charges , of which he was later acquitted . Years later , Matta recalled , " [ Stevens ] was just a hungry young kid that was desperate to get into coaching . He had a great passion and was willing to take a risk to get into the coaching profession . " 
- After Matta left the school following the 2000 – 01 season , new head coach Todd Lickliter promoted Stevens to a full @-@ time assistant coach . Under Lickliter , Stevens was active in every aspect of the game : skills instruction , game preparation , in @-@ game coaching , and recruiting . Butler was 131 – 61 during Stevens ' time as an assistant coach . 
- 
- = = = Named head coach = = = 
- 
- On April 2 , 2007 , Lickliter resigned in order to take the head @-@ coaching position at the University of Iowa . The Butler players had a meeting with athletic director Barry Collier , urging him to promote from within . Collier , having spent the entire season observing the assistant coaches ' interaction with the team , agreed . The day after Lickliter resigned Stevens and Butler 's two other assistant coaches interviewed for the job . Within 24 hours of the interviews Stevens was named Butler 's new head coach . According to Collier , Stevens had something older , outside candidates could never match : six years of experience learning the Butler system , dubbed " The Butler Way " by Collier . " Age wasn 't a factor because I 'd seen his ability shine through during the course of the season , " Collier said . 
- 
- = = = 2007 – 08 season = = = 
- 
- At the start of the 2007 – 08 season , Stevens was the second youngest coach in Division I basketball . He got off to a fast start , winning his first eight games before falling to Wright State 43 – 42 . Legendary coach Bob Knight , whose Texas Tech team was an early victim , said " I wish we played as smart as they do . " Virginia Tech coach Seth Greenberg added " they 've got toughness about them and they expect to win . " 
- Midway through Stevens ' first season , with the Bulldogs at 12 – 1 , The New York Times wrote " so far , Stevens has made the transition [ to head coach ] look easy . " The Times went on to state that Stevens had the calm and composure of a seasoned veteran . " You ’ ve got a lot of people always looking for the next step . And that ’ s not what I was doing . I was just trying to figure out a way to win the next game and think like a head coach . " Stevens said . 
- Butler ended the regular season with a 27 – 3 record , taking first place in the Horizon League with a 16 – 2 in conference mark . The team beat Illinois @-@ Chicago 66 – 50 and Cleveland State 70 – 55 to claim the league 's tournament title and an automatic bid to the 2008 NCAA tournament . Butler was awarded the seven seed in the East Regional . They beat tenth @-@ seeded South Alabama 81 – 61 in the first round , before falling to second @-@ seeded Tennessee 76 – 71 in overtime . 
- Stevens ended up with a school and Horizon league record 30 wins , beating several big name schools – Michigan , Texas Tech , Florida State , Ohio State – along the way . In so doing , he became the third @-@ youngest head coach in NCAA Division I history to lead a team to 30 wins in a season , and became the fourth @-@ winningest first @-@ year coach . Butler was nationally ranked for a school and league record 19 consecutive weeks . Butler 's 30 – 4 record was the best among teams that did not reach the Final Four . Stevens was a finalist for the Hugh Durham Award , losing to Keno Davis of Drake , and a finalist for the Jim Phelan National Coach of the Year Award , losing to Bo Ryan . 
- At the conclusion of the season , Butler signed Stevens to a seven @-@ year contract . " We are extremely excited to reach this long @-@ term agreement to have Brad continue to lead our program , " Collier remarked . 
- 
- = = = 2008 – 09 season = = = 
- 
- Butler lost four starters after the 2007 – 08 season , and was picked to finish fifth in the Horizon league during the 2008 – 09 season . The team got off to a 12 – 1 start that won Stevens the Hugh Durham mid @-@ season coaching award . On February 5 , Stevens notched his 50th win as Butler beat Detroit 66 – 61 . In so doing , Stevens became the sixth head coach in NCAA history to reach 50 wins in 56 games or fewer . Butler finished first in the Horizon League with a 15 – 3 in conference record , defying preseason expectations . Butler lost the Horizon League tournament final 57 – 54 to Cleveland State , but made the NCAA tournament as an at @-@ large selection . The team received the nine seed in the South Regional , and lost to eighth @-@ seeded Louisiana State in the first round by a score of 75 – 71 to finish the year at 26 – 6 overall . 
- Stevens ' 56 – 10 two @-@ year record places him second only to Bill Guthridge ( 58 ) in total wins during one 's first two years as head coach . Stevens was a finalist for both the Hugh Durham and Jim Phelan Awards for the second straight year and was named the Horizon League Coach of the Year . He was also named as a finalist for the Henry Iba Coach of the Year Award . Stevens was given a one @-@ year contract extension at the conclusion of the season . 
- 
- = = = 2009 – 10 season = = = 
- 
- Fueled in large part by Gordon Hayward 's and Shelvin Mack 's roles in leading Team USA to the gold medal in the FIBA Under @-@ 19 World Championship during the off @-@ season , Butler began the season ranked 10th in the Coaches ' Poll and 11th in the AP Poll . A few commentators picked the Bulldogs as a possible " sleeper team " to make the Final Four . Stevens was not so sure , privately telling his father , " We have a really good team , and I ’ m not sure how far we can go this year , but next year , we ought to go really far . " 
- Butler got off to a mediocre start , losing twice in the 76 Classic 82 – 73 to 22nd @-@ ranked Minnesota and to 19th @-@ ranked Clemson 70 – 69 . After the tournament Butler 's record stood at 4 – 2 and the team dropped to # 23 in the AP Poll and # 20 in the Coaches ' Poll . Butler won its next two games before falling to 13th @-@ ranked Georgetown 72 – 65 in the Jimmy V Classic . The team won its next two games beating # 15 Ohio State 74 – 66 and edging out former conference rival Xavier 69 – 68 , both at home . After losing 67 – 57 at UAB three days later , Butler stood at 9 – 4 and fell out of the AP rankings . However , the team remained in the Coaches Poll at # 23 . 
- Stevens rallied the team , and they proceeded to win 16 straight games before facing Siena in a BracketBusters game . Butler beat Siena 70 – 53 and Stevens tied the NCAA record for most wins ( 81 ) by a head coach in his first three seasons set by Mark Few of Gonzaga in 2002 and tied by Mark Fox of Nevada in 2007 . 
- On February 26 , 2010 , Butler traveled to Valparaiso for their regular season finale . Leading scorer Gordon Hayward was sidelined with lower back pain , but the team still won 74 – 69 . In so doing , Stevens broke the coaching record he had tied the prior week and Butler completed an 18 – 0 undefeated conference schedule . It was Butler 's first undefeated conference record since joining the Horizon League , and first since Joe Sexson led the 1978 team to a 6 – 0 record in the now defunct Indiana Collegiate Conference . Stevens earned his third straight regular @-@ season conference championship . 
- In the Horizon league tournament , Stevens ' Bulldogs used their home @-@ court advantage to beat Milwaukee 68 – 59 in the semi @-@ finals and to beat Wright State 70 – 45 in the finals . The win earned the team an automatic bid into the 2010 NCAA tournament , and completed a 20 – 0 run through league play . Stevens became the first coach to lead a Horizon League team to both an undefeated regular season and conference tournament since the league was formed in 1979 . Stevens was also the only coach in Division I to lead his team to an undefeated conference schedule during the 2009 – 10 season . 
- 
- = = = = NCAA tournament = = = = 
- 
- For their season , the Bulldogs were ranked 8th in the final pre @-@ NCAA tournament Coaches ' Poll and 11th in the corresponding AP Poll . On Selection Sunday , the Bulldogs were seeded fifth in the West regional of the NCAA tournament and given a first @-@ round match up with twelfth seeded UTEP on March 18 . 
- Many basketball commentators picked UTEP to pull the upset , and at halftime it looked like they might be right , as UTEP led 33 – 27 . Stevens made a number of halftime adjustments , and the Bulldogs came out firing on all cylinders in the second half . The team dominated the second half and won the game 77 – 59 . Butler next faced off with thirteenth seeded Murray State . The game was close throughout , but Butler emerged victorious 54 – 52 when Hayward deflected a Murray State pass into the back court with less than five seconds on the clock . The win gave Stevens the first Sweet Sixteen appearance of his career . 
- On March 25 , 2010 , Butler faced top @-@ seeded Syracuse . The Bulldogs got off to a good start , jumping out to a 12 – 1 lead and a 35 – 25 halftime advantage . Syracuse rallied in the second half , taking its first lead of the game , 40 – 39 , off a Wes Johnson three @-@ pointer . Stevens called timeout and Butler regained the lead on its next possession , stopping the run . At the 5 : 32 mark , Syracuse got a rare fast break opportunity that ended with a dunk and 54 – 50 lead . Stevens again called time out and re @-@ focused the team . Butler responded by holding Syracuse scoreless for the next 5 minutes , taking a 60 – 54 lead with 0 : 59 to go . Butler held on to win 63 – 59 , advancing to the Elite Eight for the first time in school history . 
- Two days later , Stevens ' Bulldogs met second @-@ seeded Kansas State in the regional finals . Perhaps feeling the effects of their double overtime 101 – 96 win two days prior , Kansas State got off to a slow start , scoring just 20 points in the first half to trail 27 – 20 . Butler kept the lead in the upper single digits for most of the second half , before Kansas State went on a 13 – 2 run and took a 52 – 51 lead . Stevens immediately called time out and re @-@ focused the team . " Play your game . Just play your game , " he told them . On the ensuing possession , Butler regained the lead for good . They outscored Kansas State 12 – 4 the rest of the way and won the game 63 – 56 . In the post game celebration , Stevens and walk @-@ on forward Emerson Kampen connected on a flying back @-@ bump that became one of the iconic images of the tournament . 
- The win earned the Bulldogs a trip back to Indianapolis for the first Final Four appearance in school and Horizon League history . The win made Stevens , at age 33 , the youngest coach to lead a team to the Final Four since Bob Knight made his first Final Four appearance at age 32 in 1973 . Butler became the smallest school ( enrollment 4 @,@ 200 ) to make the Final Four since seeding began in 1979 . 
- 
- = = = = = Final Four = = = = = 
- 
- On April 3 , Brad Stevens and the Butler Bulldogs faced off with Michigan State in the national semi @-@ finals . Michigan State took an early 14 – 7 lead , and Matt Howard got in early foul trouble , sitting most of the first half . Stevens kept the team focused with a " next man up " attitude and the game was tied at 28 at halftime . The second half was dominated by tight defense for both sides . With 2 : 45 to go in the game , the score was 47 – 44 Butler . Michigan State called a time out to set up a play . Stevens correctly anticipated the play call and had Ronald Nored , the team 's best defender , switch onto Korie Lucious off a screen . Nored stole the ball and Shawn Vanzant got fouled on the resulting run out , hitting 1 of 2 . Trailing 50 – 49 with under 30 seconds remaining , Michigan State came up empty and was forced to foul . Nored hit both foul shots , giving Butler a 52 – 49 lead . After a Michigan State time out , Stevens had his team foul Lucious with 2 seconds remaining to prevent a potentially game tying 3 @-@ pointer . After making the first , Lucious intentionally missed the second free throw . Hayward came down with the rebound to seal the victory . Butler became the first team since the shot clock was adopted for the 1985 – 86 season to hold five straight tournament opponents under 60 points . 
- On April 5 , 2010 , Butler and Duke faced off in what The New York Times called " the most eagerly awaited championship game in years " . Late in the first half , Duke went on an 8 – 0 run to take a 26 – 20 lead . Stevens called a timeout . With starters Matt Howard and Ronald Nored on the bench in foul trouble , Stevens was forced to call on backup center Avery Jukes who came up big for Butler . Jukes scored 10 first half points , tying his season high . At half time , Duke 's lead stood at 33 – 32 . 
- The second half was played very closely , with neither team taking a substantial lead . With 3 : 16 to play , Duke took a 60 – 55 lead on two made free throws by Nolan Smith . Butler cut the lead to one point in the final minute and , after a missed Kyle Singler jump shot with 36 seconds remaining , got a chance to retake the lead . Butler was unable to initiate their offense and Stevens called a timeout to set up a play . A failed inbounds attempt and a timeout later , Hayward missed a baseline fade @-@ away jumper and Brian Zoubek came down with the rebound for Duke . He was quickly fouled with less than 4 seconds remaining . Hayward narrowly missed a desperation half @-@ court shot as time expired , making the final margin 61 – 59 . 
- The loss snapped Butler 's 25 @-@ game winning streak , the longest in school history . Butler became the smallest school to play for a National Championship since Jacksonville in 1970 . Stevens became the second @-@ youngest head coach to coach in the NCAA National Championship Game , behind Branch McCracken who led the Indiana Hoosiers to the 1940 National Championship at age 31 . Stevens was named as both a Hugh Durham and Jim Phelan Award finalist for the third consecutive year , losing to Mike Young and Jamie Dixon respectively . He was also a finalist for the Skip Prosser Man of the Year Award , which was won by Bob Marlin . 
- Butler finished the year ranked # 2 in the Coaches ' Poll , the highest ranking in school history . The school was ranked for 19 consecutive weeks , tying the school record . 
- 
- = = = 2010 off @-@ season = = = 
- 
- After the end of the 2009 – 10 season , Brad Stevens and Butler continued to attract considerable attention . President Barack Obama personally called Stevens to congratulate him on Butler 's season . David Letterman had Stevens on his show for a guest appearance . Butler admissions inquiries shot up 67 % . Stevens received fan letters from around the world , and his phone rang off the hook . He was invited to throw the ceremonial first pitch before the Chicago Cubs vs. Florida Marlins game in Chicago on May 10 . " It 's all been very surreal , " Stevens said . " If you are the runner @-@ up , you don 't expect to talk to the president . " " It 's been a little overwhelming , because I 'm a pretty simple guy , " he added . 
- The 2009 – 10 season also helped increase Butler 's recruiting profile . Asked if the increased fame would change things , Stevens said it better not spoil him or the university . " I look at this new challenge of not changing and sticking to your core values and making sure you remain humble as a great coaching opportunity . " 
- 
- = = = 2010 – 11 season = = = 
- 
- Rankings by ESPN 's Andy Katz and Fox Sports ' Jeff Goodman released shortly after the 2010 Championship game both had Butler third for the 2010 – 2011 season . Duke coach Mike Krzyzewski agreed , saying Butler would be " right up there , No. 1 or No. 2 ... They 'll be a favorite next year . " However , Hayward chose to leave early for the NBA Draft and Butler went through a rough patch early in the season , at one point losing three straight games and having a 6 – 5 conference record . Bolstered by the emergence of Andrew Smith at center and Matt Howard 's success as a perimeter forward , Butler ended up winning a share of the conference title at 13 – 5 . The Bulldogs then won the Horizon League Tournament to secure an automatic NCAA tournament bid , and received an 8 seed . 
- Picked by many to lose a first @-@ round match @-@ up against Old Dominion , Butler advanced on a last @-@ second tip @-@ in by Howard . Howard was also clutch in their next game , hitting a free throw with less than one second remaining to beat Pitt in a dramatic finish . Shelvin Mack scored 30 points in the win . Butler won their next game when they defeated Wisconsin . On March 26 , 2011 , the Bulldogs beat Florida 74 – 71 in overtime to earn back @-@ to @-@ back trips to the Final Four . On April 2 , Butler beat fellow Cinderella team VCU 70 – 62 to make it to a second consecutive national championship game . For the second consecutive year , the Bulldogs fell in the national championship game , this time to Connecticut . 
- 
- = = = Coaching future = = = 
- 
- On April 8 , 2010 , Stevens signed a long @-@ term deal with Butler , extending his contract through the 2021 – 22 season . Financial terms of the contract were not disclosed ; however , Butler president Bobby Fong had publicly stated that the university could afford to increase Stevens ' base salary to approximately US $ 1 @,@ 000 @,@ 000 a few days prior . Stevens had previously made US $ 395 @,@ 000 plus benefits in base salary , a relatively low figure for a successful Division I head basketball coach . His total compensation for 2009 – 10 was estimated at US $ 750 @,@ 000 . Stevens had received a raise after each of his three seasons at Butler and his contract contains a buyout clause estimated in the high six or low seven figures . 
- By re @-@ signing with Butler , Stevens temporarily ended speculation that he would leave the university for a higher paying job . Oregon , Clemson , and Wake Forest were all said to be interested in offering Stevens multi @-@ million dollar contracts to leave Butler . " First and foremost , I ’ m loyal to Butler , " Stevens said . When asked if he would ever leave Butler , Stevens replied " I guess if they kicked me out . " 
- After the 2011 – 12 season , Stevens was pursued vigorously by Illinois to fill their coaching vacancy before he declined their offer . 
- In March 2013 , UCLA reportedly offered Stevens between $ 2 @.@ 5 and $ 3 million a year to leave Butler . Rumors circulated that Stevens was in contract negotiations with UCLA , but ultimately the rumors proved false and Stevens stayed at Butler . Commenting on the situation , a source close to Stevens said " Brad doesn 't understand why people would assume he 's leaving . " A few days later , Stevens reiterated that he was very happy at Butler and had no intentions to leave as long as he had the support of the university to continue running the program the " right way " . 
- 
- = = NBA = = 
- 
- 
- = = = Boston Celtics = = = 
- 
- On July 3 , 2013 , Stevens was signed as the head coach by the Boston Celtics . Reports state that his new contract is a six @-@ year , $ 22 million deal . In April 2015 , Stevens led the Celtics to the NBA Playoffs as the 7th seed in the Eastern Conference with a 40 – 42 record . On April 21 , 2015 , it was announced that Stevens finished fourth in voting for the NBA 's Coach of the Year Award . On March 1 , 2016 , Stevens was named the Eastern Conference Coach of the Month for games played during February . In April 2016 , Brad Stevens led the Celtics to their second consecutive playoff appearance under his tenure as the 5th seed in the 2016 NBA Playoffs finishing the season with a 48 – 34 record . On June 1 , 2016 , Stevens received a contract extension . 
- 
- = = Coaching style = = 
- 
- According to Stevens , in one of his first games as head coach , he was nervous and " felt like our team played on edge " because of it . He decided that a team 's play will reflect the mood of its coach ; a calm coach means a team that will remain poised in difficult game situations , while a nervous coach means a team that plays on edge . " I don ’ t want to lose a game because of my approach , " he told himself . Accordingly , he developed a strategy of always remaining calm and focused during games . He rarely raises his voice or gets emotional , instead quietly observing on the sideline with folded arms . He does not get upset about bad calls by referees or player mistakes , preferring to focus on " the next play " rather than what just happened . Butler player Willie Veasley explained Butler 's 2010 Final Four run by saying , " When those big runs [ by Syracuse and Kansas State ] came , Coach called a timeout and said a few calm words . Then he said he believes in us , he loves us and we 're going to win the game . " On the rare occasion Stevens feels the need to correct a player , he does it with " positive reinforcement , just at a little louder decibel " , according to former assistant coach Matthew Graves . Above all , Stevens wants his players to be confident , not living in fear of being yanked for making a bad play . 
- Externally , Stevens is always calm , but internally he is far from it . " I 'm not as calm as everybody thinks , " Stevens says . His wife Tracy adds , " He ’ s calm and collected , but he ’ s fiercely competitive . He ’ s always thinking about how he can beat you . " Former player Joel Cornette says " Everyone sees Brad as a level @-@ headed , calm and cool coach , but he ’ s about as competitive of a guy as I know . We would get into it constantly , whether playing two @-@ on @-@ two or arguing about players ’ having better college careers . " 
- Stevens spends a lot of time preparing for each game , and always tries to add a few new wrinkles specific to that game 's opponent . Sports Illustrated calls Stevens an expert " on breaking down tape and looking at statistical trends to find opponents ' weaknesses . " Former player Ronald Nored agrees : " We know everything we need to about our opponents , all their tendencies are broken down " ahead of time . 
- Stevens is a proponent of using statistical analysis to enhance his coaching decisions , spending almost as much time looking at statistics as watching game film . " I think it 's a unique way of looking at the game that may be able to help best communicate to your players " , he explains . For example , when Butler was slumping in late 2010 , Stevens challenged his team : " this [ 46 % defensive field goal percentage ] is where we are . This isn 't acceptable to get to where we want to go . But what does that really mean ? It 's not just get better defensively , it is , if we give up 3 less baskets a game , then we will be at 40 percent field goal percentage defense which will be top 20 in the country " . The team got the message , improved throughout the season , and ultimately went on a March run fueled by defense . In 2012 , Stevens became the first college coach to hire someone solely for statistical research when he added Drew Cannon to the staff . If he had the resources , Stevens says he would hire a team of statisticians to analyze the team 's play . 
- Stevens ' teams are built around solid basketball fundamentals and good team work , rather than individual basketball skill . His teams are known for their defense , forcing opponents into uncharacteristic mistakes . The secret to basketball – and life – is " just to do the job to the best of your ability and don 't worry about anything else , " Stevens says . " Win the next game . Win the next possession . That 's our focus . It 's boring . It 's also the way championships are won " , he says . In short , Stevens is a strong believer in " The Butler Way " – doing all the little things that transform a group of good basketball players into a great basketball team . " I tell the players ' the Butler Way ' isn 't easy to define , " Stevens says , " but you can see it on the floor when we share the basketball , play with great energy and defend . " 
- Stevens prefers to recruit strong team players instead of going after " top recruits . " " The guys we [ have ] recruited , most of them weren 't very highly ranked , " Stevens says . " They had very good high school careers or careers at other places ( transfers ) , but for one reason or the other they weren 't seen as great players . But they all had intangibles . " Stevens puts a strong emphasis on education and has said he would only recruit a " one and done " player if he was committed to getting his degree while playing professionally . 
- Stevens has often been referred to as a coaching prodigy , but is not interested in self @-@ promotion . He instead prefers to deflect the praise he receives to the players , athletic department , and his mentors . He has not been known to posture for more money , or to leak his name for open coaching positions . He has been described as humble , modest , and not " about the money " . 
- The New York Times , USA Today , ESPN , and other commentators have attributed Butler 's success against teams with superior athletes to Stevens ' coaching style . The Times remarks , " the Bulldogs are very well prepared for their opponents , and they do not rattle easily " , and says that the resulting confidence has led to the team 's success . " He coaches to his personality and to his strengths , " Collier says . " Obviously , he has great rapport and communication ability with his team . " Yahoo ! Sports compared Stevens to legendary coach John Wooden writing " Brad Stevens is winning at Butler the Wooden way – calm and composed on the sideline . " Wooden agreed , saying , " I enjoy watching [ Stevens ] and very much enjoy [ Butler 's ] style of play . " 
- 
- = = Personal life = = 
- 
- Brad Stevens is known for his youthful looks , often being described as " baby @-@ faced " . One commentator remarked , " Stevens looks like he checks the mirror every morning to see if it 's time to start shaving . " On occasion , he has been mistaken for a player . He is also known for projecting a professional , " corporate " look from the sidelines . Asked what his life would be like if he had never taken up coaching , he replies " If everything else remained the same , I would have been as happy as heck ... Friends and family and faith , they 're going to take the cake over all this stuff . " Stevens met his wife , Tracy ( née Wilhelmy ) , while attending DePauw University . Tracy , who played soccer for DePauw , quickly learned of Brad 's love for basketball ; on their third date he drove her an hour and a half to attend a high school basketball game . Tracy graduated from Rocky River High School in 1995 , and from DePauw in 1999 . She returned to school in 2000 , driving five hours from Case Western 's law school to Indianapolis on weekends to see Brad . She finished her final year of law school in Indianapolis , and the couple married in August 2003 . Tracy works as a labor and employment lawyer . Tracy also serves as Brad 's agent . 
- Brad and Tracy Stevens are involved with the American Cancer Society 's Coaches Vs . Cancer . Brad says that the cause really hit home for them after Tracy 's mother died of the disease in June 2004 . The day before Butler 's 2010 Final Four appearance , they hosted a fundraiser for the organization . Brad Stevens has also volunteered his time to the Jukes Foundation for Kids , a charity benefiting Ugandan children run by former Butler player Avery Jukes . Stevens remains in close touch with the Butler basketball family ; he notably took a one @-@ game leave from the Celtics in January 2016 to visit with Andrew Smith , a player on both of Butler 's Final Four teams who was dying of cancer ; Smith died less than a week later . At the request of Andrew 's widow , Sam , Brad delivered the eulogy at the memorial service on January 17 , 2016 . 
- Stevens ' father , Mark , is an orthopedic surgeon in Indianapolis and former Indiana Hoosiers football player . His mother , Jan , is a university professor . She has previously taught at Butler . 
- 
- = = Head coaching record = = 
- 
- 
- = = = College = = = 
- 
- 
- = = = NBA = = = 
- 
- 
- = = = Awards and nominations = = = 
- 
- Henry Iba Coach of the Year Award finalist ( 2009 ) 
- Horizon League Coach of the Year ( 2009 , 2010 ) 
- Hugh Durham Award for Mid @-@ major Coach of the Year finalist ( 2008 , 2009 , 2010 ) 
- Hugh Durham Award Mid @-@ season honors ( 2009 ) 
- Jim Phelan National Coach of the Year Award finalist ( 2008 , 2009 , 2010 ) 
- Skip Prosser Man of the Year Award finalist ( 2010 ) 
- 
- 
- = Shackleton ( crater ) = 
- 
- Shackleton is an impact crater that lies at the south pole of the Moon . The peaks along the crater 's rim are exposed to almost continual sunlight , while the interior is perpetually in shadow ( a Crater of eternal darkness ) . The low @-@ temperature interior of this crater functions as a cold trap that may capture and freeze volatiles shed during comet impacts on the Moon . Measurements by the Lunar Prospector spacecraft showed higher than normal amounts of hydrogen within the crater , which may indicate the presence of water ice . The crater is named after Antarctic explorer Ernest Shackleton . 
- 
- = = Description = = 
- 
- The rotational axis of the Moon lies within Shackleton , only a few kilometers from its center . The crater is 21 km in diameter and 4 @.@ 2 km deep . From the Earth , it is viewed edge @-@ on in a region of rough , cratered terrain . It is located within the South Pole @-@ Aitken basin on a massif . The rim is slightly raised above the surrounding surface and it has an outer rampart that has been only lightly impacted . No significant craters intersect the rim , and it is sloped about 1 @.@ 5 ° toward the direction 50 – 90 ° from the Earth . The age of the crater is about 3 @.@ 6 billion years and it has been in the proximity of the south lunar pole for at least the last two billion years . 
- Because the orbit of the Moon is tilted only 5 ° from the ecliptic , the interior of this crater lies in perpetual darkness . Estimates of the area in permanent shadow were obtained from Earth @-@ based radar studies . Peaks along the rim of the crater are almost continually illuminated by sunlight , spending about 80 – 90 % of each lunar orbit exposed to the Sun . Continuously illuminated mountains have been termed peaks of eternal light and have been predicted to exist since the 1900s . 
- The shadowed portion of the crater was imaged with the Terrain Camera of the Japanese SELENE spacecraft using the illumination of sunlight reflected off the rim . The interior of the crater consists of a symmetrical 30 ° slope that leads down to a 6 @.@ 6 km diameter floor . The handful of craters along the interior span no more than a few hundred meters . The bottom is covered by an uneven mound @-@ like feature that is 300 to 400 m thick . The central peak is about 200 m in height . 
- The continuous shadows in the south polar craters cause the floors of these formations to maintain a temperature that never exceeds about 100 K. For Shackleton , the average temperature was determined to be about 90 K , reaching 88 K at the crater floor . Under these conditions , the estimated rate of loss from any ice in the interior would be $10^{-26}$ to $10^{-27}$ m / s . Any water vapor that arrives here following a cometary impact on the Moon would lie permanently frozen on or below the surface . However , the surface albedo of the crater floor matches the lunar far @-@ side , suggesting that there is no exposed surface ice . 
- This crater was named after Ernest Henry Shackleton , an Anglo @-@ Irish explorer of Antarctica from 1901 until his death in 1922 . The name was officially adopted by the International Astronomical Union in 1994 . Nearby craters of note include Shoemaker , Haworth , de Gerlache , Sverdrup , and Faustini . Somewhat farther away , on the eastern hemisphere of the lunar near side , are the larger craters Amundsen and Scott , named after two other early explorers of the Antarctic continent . 
- 
- = = Exploration = = 
- 
- From the perspective of the Earth , this crater lies along the southern limb of the Moon , making observation difficult . Detailed mapping of the polar regions and farside of the Moon did not occur until the advent of orbiting spacecraft . Shackleton lies entirely within the rim of the immense South Pole @-@ Aitken basin , which is one of the largest known impact formations in the Solar System . This basin is over 12 kilometers deep , and an exploration of its properties could provide useful information about the lunar interior . 
- A neutron spectrometer on board the Lunar Prospector spacecraft detected enhanced concentrations of hydrogen close to the northern and southern lunar poles , including the crater Shackleton . At the end of this mission in July 1999 , the spacecraft was crashed into the nearby crater Shoemaker in the hope of detecting from Earth @-@ based telescopes an impact @-@ generated plume containing water vapor . The impact event did not produce any detectable water vapor , and this may be an indication that the hydrogen is not in the form of hydrated minerals , or that the impact site did not contain any ice . Alternatively , it is possible that the crash did not excavate deeply enough into the regolith to liberate significant quantities of water vapor . 
- From Earth @-@ based radar and spacecraft images of the crater edge , Shackleton appears to be relatively intact ; much like a young crater that has not been significantly eroded from subsequent impacts . This may mean that the inner sides are relatively steep , which may make traversing the sides relatively difficult for a robotic vehicle . In addition , it is possible that the interior floor might not have collected a significant quantity of volatiles since its formation . However other craters in the vicinity are considerably older , and may contain significant deposits of hydrogen , possibly in the form of water ice . ( See Shoemaker ( lunar crater ) , for example . ) 
- Radar studies preceding and following the Lunar Prospector mission demonstrate that the inner walls of Shackleton are similar in reflective characteristics to those of some sunlit craters . In particular , the surroundings appear to contain a significant number of blocks in its ejecta blanket , suggesting that its radar properties are a result of surface roughness , and not ice deposits , as was previously suggested from a radar experiment involving the Clementine mission . This interpretation , however , is not universally agreed upon within the scientific community . Radar images of the crater at a wavelength of 13 cm show no evidence for water ice deposits . 
- Optical imaging inside the crater was done for the first time by the Japanese lunar orbiter spacecraft Kaguya in 2007 . It did not find any evidence of a significant amount of water ice , down to the image resolution of 10 m per pixel . 
- On November 15 , 2008 , a 34 @-@ kg probe made a hard landing near the crater . The moon impact probe ( MIP ) was launched from the Indian Chandrayaan @-@ I spacecraft and reached the surface 25 minutes later . The probe carried a radar altimeter , video imaging system , and a mass spectrometer , which will be used to search for water . 
- 
- = = Potential uses = = 
- 
- Some sites along Shackleton 's rim receive almost constant illumination . At these locales sunlight is almost always available for conversion into electricity using solar panels , potentially making them good locations for future Moon landings . The temperature at this site is also more favorable than at more equatorial latitudes as it does not experience the daily temperature extremes of 100 ° C when the Sun is overhead , to as low as − 150 ° C during the lunar night . 
- While scientific experiments performed by Clementine and Lunar Prospector could indicate the presence of water in the polar craters , the current evidence is far from definitive . There are doubts among scientists as to whether or not the hydrogen is in the form of ice , as well as to the concentration of this " ore " with depth below the surface . Resolution of this issue will require future missions to the Moon . The presence of water suggests that the crater floor could potentially be " mined " for deposits of hydrogen in water form , a commodity that is expensive to deliver directly from the Earth . 
- This crater has also been proposed as a future site for a large infrared telescope . The low temperature of the crater floor makes it ideal for infrared observations , and solar cells placed along the rim could provide near @-@ continuous power to the observatory . About 120 kilometers from the crater lies the 5 @-@ km tall Malapert Mountain , a peak that is perpetually visible from the Earth , and which could serve as a radio relay station when suitably equipped . 
- NASA has named the rim of Shackleton as a potential candidate for its lunar outpost , slated to be up and running by 2020 and continuously staffed by a crew by 2024 . The location would promote self @-@ sustainability for lunar residents , as perpetual sunlight on the south pole would provide energy for solar panels . Furthermore , the shadowed polar regions are believed to contain the frozen water necessary for human consumption and could also be harvested for fuel manufacture . 
- 
- 
- = American Beauty ( 1999 film ) = 
- 
- American Beauty is a 1999 American drama film directed by Sam Mendes and written by Alan Ball . Kevin Spacey stars as Lester Burnham , a 42 @-@ year @-@ old advertising executive who has a midlife crisis when he becomes infatuated with his teenaged daughter 's best friend , Angela ( Mena Suvari ) . Annette Bening co @-@ stars as Lester 's materialistic wife , Carolyn , and Thora Birch plays their insecure daughter , Jane . Wes Bentley , Chris Cooper , and Allison Janney also feature . The film is described by academics as a satire of American middle @-@ class notions of beauty and personal satisfaction ; analysis has focused on the film 's explorations of romantic , and paternal love , sexuality , beauty , materialism , self @-@ liberation , and redemption . 
- Ball began writing American Beauty as a play in the early 1990s , partly inspired by the media circus around the Amy Fisher trial in 1992 . He shelved the play after realizing the story would not work on stage . After several years as a television screenwriter , Ball revived the idea in 1997 when attempting to break into the film industry . The modified script had a cynical outlook that was influenced by Ball 's frustrating tenures writing for several sitcoms . Producers Dan Jinks and Bruce Cohen took American Beauty to DreamWorks ; the then @-@ fledgling film studio bought Ball 's script for $ 250 @,@ 000 , outbidding several other production bodies . DreamWorks financed the $ 15 million production and served as its North American distributor . American Beauty marked acclaimed theater director Mendes ' film debut ; courted after his successful productions of the musicals Oliver ! and Cabaret , Mendes was , nevertheless , only given the job after 20 others were considered and several " A @-@ list " directors turned down the opportunity . 
- Spacey was Mendes ' first choice for the role of Lester , though DreamWorks had urged the director to consider better @-@ known actors ; similarly , the studio suggested several actors for the role of Carolyn until Mendes offered the part to Bening without DreamWorks ' knowledge . Principal photography took place between December 1998 and February 1999 on soundstages at the Warner Bros. backlot in Burbank , California , and on location in Los Angeles . Mendes ' dominant style was deliberate and composed ; he made extensive use of static shots and slow pans and zooms to generate tension . Cinematographer Conrad Hall complemented Mendes ' style with peaceful shot compositions to contrast with the turbulent on @-@ screen events . During editing , Mendes made several changes that gave the film a less cynical tone than the script . 
- Released in North America on September 17 , 1999 , American Beauty was positively received by critics and audiences ; it was the best @-@ reviewed American film of the year and grossed over $ 356 million worldwide . Reviewers praised most aspects of the production , with particular emphasis on Mendes , Spacey , and Ball ; criticism focused on the familiarity of the characters and setting . DreamWorks launched a major campaign to increase the film 's chances of Academy Award success ; at the 72nd Academy Awards the following year , the film won Best Picture , Best Director , Best Actor ( for Spacey ) , Best Original Screenplay , and Best Cinematography . It was nominated for and won many other awards and honors , mainly for the direction , writing , and acting . 
- 
- = = Plot = = 
- 
- Lester Burnham is a middle @-@ aged advertising executive and magazine writer who despises his job . He is unhappily married to Carolyn , a neurotic yet fiercely ambitious real estate broker ; their teenaged daughter , Jane , abhors her parents and has low self @-@ esteem . The Burnhams ' new neighbors are retired United States Marine Corps Colonel Frank Fitts and his near @-@ catatonic wife , Barbara . Their teenaged son , Ricky , constantly films his surroundings with a camcorder , collecting hundreds of recordings on videotapes in his bedroom . His job as a part @-@ time bar caterer serves as a front for his secret marijuana dealing . Col. Fitts is a strict disciplinarian who previously sent Ricky to a military school and briefly committed him to a psychiatric hospital . Jim Olmeyer and Jim Berkley , a gay couple who live nearby , welcome the family to the neighborhood ; the homophobic Col. Fitts angrily asks Ricky " why these faggots have to rub it in your face . " 
- Lester becomes infatuated with Jane 's vain friend , Angela Hayes , after seeing her perform a half @-@ time dance routine at a high school basketball game . He starts having sexual fantasies about Angela , in which red rose petals are a recurring motif . Carolyn begins an affair with her married business rival , Buddy Kane . When Lester 's boss , Brad , tells him that he is to be laid off , Lester instead blackmails him for $ 60 @,@ 000 and quits his job . Lester takes a minimum @-@ wage job at a fast @-@ food restaurant , trades in his Toyota Camry for his dream car , a 1970 Pontiac Firebird , and starts working out after he overhears Angela tell Jane that she would find him sexually attractive if he got in shape . He begins smoking marijuana supplied by Ricky . The girls ' friendship wanes after Jane starts a relationship with Ricky . Jane and Ricky bond over what Ricky considers the most beautiful imagery he has filmed : a plastic bag being blown in the wind . 
- Lester discovers Carolyn 's infidelity , but reacts indifferently . Buddy ends the affair , fearing an expensive divorce . Col. Fitts becomes suspicious of Lester and Ricky 's friendship when he finds his son 's footage of Lester lifting weights while nude , which Ricky captured by chance , leading him to believe that Ricky is gay . After spying on Ricky and Lester through Lester 's garage window , the colonel mistakenly concludes the pair is sexually involved . He later confronts and beats Ricky for the supposed affair and accuses him of being gay . Ricky falsely admits the charges and goads his father into kicking him out of their home . Meanwhile , Carolyn is sitting in her car in the rain , taking a gun out of the glove box while a voice on the radio talks about not being a victim . Ricky goes to Jane 's bedroom , finding her arguing with Angela about Angela 's flirtation with Lester . Ricky convinces Jane to flee with him to New York City and assures Angela that she is ugly , boring , and ordinary . 
- Col. Fitts confronts Lester and attempts to kiss him ; Lester rebuffs the colonel , who tearfully flees . Carolyn puts the gun in her handbag , shouting , " I refuse to be a victim ! " Lester finds a distraught Angela sitting alone in the dark ; she asks him to tell her she is beautiful . He does , and they kiss . 
- Carolyn drives through the rain , rehearsing a confession to Lester . Just as Lester and Angela are about to have sex , she admits that she is a virgin , and Lester changes his mind . He instead comforts her and the pair bond over their shared frustrations . Angela goes to the bathroom and Lester smiles at a family photograph in his kitchen . An unseen figure raises a gun to the back of his head , a gunshot sounds , and blood sprays on the wall . Ricky and Jane find Lester 's body , while Carolyn breaks down crying in the closet . A bloodied Col. Fitts returns home , where a gun is shown to be missing from his collection . Lester 's closing narration describes meaningful experiences during his life ; he says that , despite his death , he is happy because there is " so much beauty " in the world . 
- 
- = = Themes and analysis = = 
- 
- 
- = = = Multiple interpretations = = = 
- 
- Scholars and academics have offered many possible readings of American Beauty ; film critics are similarly divided , not so much about the quality of the film , as their interpretations of it . Described by many as about " the meaning of life " or " the hollow existence of the American suburbs " , the film has defied categorization by even the filmmakers . Mendes is indecisive , saying the script seemed to be about something different each time he read it : " a mystery story , a kaleidoscopic journey through American suburbia , a series of love stories ; [ ... ] it was about imprisonment , [ ... ] loneliness , [ and ] beauty . It was funny ; it was angry , sad . " The literary critic and author Wayne C. Booth concludes that the film resists any one interpretation : " [ American Beauty ] cannot be adequately summarized as ' here is a satire on what 's wrong with American life ' ; that plays down the celebration of beauty . It is more tempting to summarize it as ' a portrait of the beauty underlying American miseries and misdeeds ' , but that plays down the scenes of cruelty and horror , and Ball 's disgust with our mores . It cannot be summarized with either Lester or Ricky 's philosophical statements about what life is or how one should live . " He argues that the problem of interpreting the film is tied with that of finding its center — a controlling voice who " [ unites ] all of the choices " . He contends that in American Beauty 's case , it is neither Mendes nor Ball . Mendes considers the voice to be Ball 's , but even while the writer was " strongly influential " on set , he often had to accept deviations from his vision , particularly ones that transformed the cynical tone of his script into something more optimistic . With " innumerable voices intruding on the original author 's , " Booth says , those who interpret American Beauty " have forgotten to probe for the elusive center " . 
According to Booth , the film 's true controller is the creative energy " that hundreds of people put into its production , agreeing and disagreeing , inserting and cutting " . 
- 
- = = = Imprisonment and redemption = = = 
- 
- Mendes called American Beauty a rite of passage film about imprisonment and escape from imprisonment . The monotony of Lester 's existence is established through his gray , nondescript workplace and characterless clothing . In these scenes , he is often framed as if trapped , " reiterating rituals that hardly please him " . He masturbates in the confines of his shower ; the shower stall evokes a jail cell and the shot is the first of many where Lester is confined behind bars or within frames , such as when he is reflected behind columns of numbers on a computer monitor , " confined [ and ] nearly crossed out " . The academic and author Jody W. Pennington argues that Lester 's journey is the story 's center . His sexual reawakening through meeting Angela is the first of several turning points as he begins to " [ throw ] off the responsibilities of the comfortable life he has come to despise " . After Lester shares a joint with Ricky , his spirit is released and he begins to rebel against Carolyn . Changed by Ricky 's " attractive , profound confidence " , Lester is convinced that Angela is attainable and sees that he must question his " banal , numbingly materialist suburban existence " ; he takes a job at a fast @-@ food outlet , which allows him to regress to a point when he could " see his whole life ahead of him " . 
- When Lester is caught masturbating by Carolyn , his angry retort about their lack of intimacy is the first time he says aloud what he thinks about her . By confronting the issue and Carolyn 's " superficial investments in others " , Lester is trying to " regain a voice in a home that [ only respects ] the voices of mother and daughter " . His final turning point comes when Angela and he almost have sex ; after she confesses her virginity , he no longer thinks of her as a sex object , but as a daughter . He holds her close and " wraps her up " . Mendes called it " the most satisfying end to [ Lester 's ] journey there could possibly have been " . With these final scenes , Mendes intended to show Lester at the conclusion of a " mythical quest " . After Lester gets a beer from the refrigerator , the camera pushes toward him , then stops facing a hallway down which he walks " to meet his fate " . Having begun to act his age again , Lester achieves closure . As he smiles at a family photo , the camera pans slowly from Lester to the kitchen wall , onto which blood spatters as a gunshot rings out ; the slow pan reflects the peace of Lester 's death . His body is discovered by Jane and Ricky . Mendes said that Ricky 's staring into Lester 's dead eyes is " the culmination of the theme " of the film : that beauty is found where it is least expected . 
- 
- = = = Conformity and beauty = = = 
- 
- Like other American films of 1999 — such as Fight Club , Bringing Out the Dead , and Magnolia — American Beauty instructs its audience to " [ lead ] more meaningful lives " . The film argues the case against conformity , but does not deny that people need and want it ; even the gay characters just want to fit in . Jim and Jim , the Burnhams ' other neighbors , are a satire of " gay bourgeois coupledom " , who " [ invest ] in the numbing sameness " that the film criticizes in heterosexual couples . The feminist academic and author Sally R. Munt argues that American Beauty uses its " art house " trappings to direct its message of nonconformity primarily to the middle classes , and that this approach is a " cliché of bourgeois preoccupation ; [ ... ] the underlying premise being that the luxury of finding an individual ' self ' through denial and renunciation is always open to those wealthy enough to choose , and sly enough to present themselves sympathetically as a rebel . " 
- Professor Roy M. Anker argues that the film 's thematic center is its direction to the audience to " look closer " . The opening combines an unfamiliar viewpoint of the Burnhams ' neighborhood with Lester 's narrated admission that he will soon die , forcing audiences to consider their own mortality and the beauty around them . It also sets a series of mysteries ; Anker asks , " from what place exactly , and from what state of being , is he telling this story ? If he 's already dead , why bother with whatever it is he wishes to tell about his last year of being alive ? There is also the question of how Lester has died — or will die . " Anker believes the preceding scene — Jane 's discussion with Ricky about the possibility of his killing her father — adds further mystery . Professor Ann C. Hall disagrees ; she says by presenting an early resolution to the mystery , the film allows the audience to put it aside " to view the film and its philosophical issues " . Through this examination of Lester 's life , rebirth and death , American Beauty satirizes American middle class notions of meaning , beauty and satisfaction . Even Lester 's transformation only comes about because of the possibility of sex with Angela ; he therefore remains a " willing devotee of the popular media 's exaltation of pubescent male sexuality as a sensible route to personal wholeness " . Carolyn is similarly driven by conventional views of happiness ; from her belief in " house beautiful " domestic bliss to her car and gardening outfit , Carolyn 's domain is a " fetching American millennial vision of Pleasantville , or Eden " . The Burnhams are unaware that they are " materialists philosophically , and devout consumers ethically " who expect the " rudiments of American beauty " to give them happiness . Anker argues that " they are helpless in the face of the prettified economic and sexual stereotypes [ ... ] that they and their culture have designated for their salvation . " 
- The film presents Ricky as its " visionary , [ ... ] spiritual and mystical center " . He sees beauty in the minutiae of everyday life , videoing as much as he can for fear of missing it . He shows Jane what he considers the most beautiful thing he has filmed : a plastic bag , tossing in the wind in front of a wall . He says capturing the moment was when he realized that there was " an entire life behind things " ; he feels that " sometimes there 's so much beauty in the world I feel like I can 't take it ... and my heart is going to cave in . " Anker argues that Ricky , in looking past the " cultural dross " , has " [ grasped ] the radiant splendor of the created world " to see God . As the film progresses , the Burnhams move closer to Ricky 's view of the world . Lester only forswears personal satisfaction at the film 's end . On the cusp of having sex with Angela , he returns to himself after she admits her virginity . Suddenly confronted with a child , he begins to treat her as a daughter ; in doing so , Lester sees himself , Angela , and his family " for the poor and fragile but wondrous creatures they are " . He looks at a picture of his family in happier times , and dies having had an epiphany that infuses him with " wonder , joy , and soul @-@ shaking gratitude " — he has finally seen the world as it is . 
- According to Patti Bellantoni , colors are used symbolically throughout the film , none more so than red , which is an important thematic signature that drives the story and " [ defines ] Lester 's arc " . First seen in drab colors that reflect his passivity , Lester surrounds himself with red as he regains his individuality . The American Beauty rose is repeatedly used as symbol ; when Lester fantasizes about Angela , she is usually naked and surrounded by rose petals . In these scenes , the rose symbolizes Lester 's desire for her . When associated with Carolyn , the rose represents a " façade for suburban success " . Roses are included in almost every shot inside the Burnhams ' home , where they signify " a mask covering a bleak , unbeautiful reality " . Carolyn feels that " as long as there can be roses , all is well " . She cuts the roses and puts them in vases , where they adorn her " meretricious vision of what makes for beauty " and begin to die . The roses in the vase in the Angela – Lester seduction scene symbolize Lester 's previous life and Carolyn ; the camera pushes in as Lester and Angela get closer , finally taking the roses — and thus Carolyn — out of the shot . Lester 's epiphany at the end of the film is expressed by rain and the use of red , building to a crescendo that is a deliberate contrast to the release Lester feels . The constant use of red " lulls [ the audience ] subliminally " into becoming used to it ; consequently , it leaves the audience unprepared when Lester is shot and his blood spatters on the wall . 
- 
- = = = Sexuality and repression = = = 
- 
- Pennington argues that American Beauty defines its characters through their sexuality . Lester 's attempts to relive his youth are a direct result of his lust for Angela , and the state of his relationship with Carolyn is in part shown through their lack of sexual contact . Also sexually frustrated , Carolyn has an affair that takes her from " cold perfectionist " to a more carefree soul who " [ sings ] happily along with " the music in her car . Jane and Angela constantly reference sex , through Angela 's descriptions of her supposed sexual encounters and the way the girls address each other . Their nude scenes are used to communicate their vulnerability . By the end of the film , Angela 's hold on Jane has weakened until the only power she has over her friend is Lester 's attraction to her . Col. Fitts reacts with disgust to meeting Jim and Jim ; he asks , " How come these faggots always have to rub it in your face ? How can they be so shameless ? " To which Ricky replies , " That 's the thing , Dad — they don 't feel like it 's anything to be ashamed of . " Pennington argues that Col. Fitts ' reaction is not homophobic , but an " anguished self @-@ interrogation " . 
- With other turn @-@ of @-@ the @-@ millennium films such as Fight Club , In the Company of Men ( 1997 ) , American Psycho ( 2000 ) , and Boys Don 't Cry ( 1999 ) , American Beauty " raises the broader , widely explored issue of masculinity in crisis " . Professor Vincent Hausmann charges that in their reinforcement of masculinity " against threats posed by war , by consumerism , and by feminist and queer challenges " , these films present a need to " focus on , and even to privilege " aspects of maleness " deemed ' deviant ' " . Lester 's transformation conveys " that he , and not the woman , has borne the brunt of [ lack of being ] " and he will not stand for being emasculated . Lester 's attempts to " strengthen traditional masculinity " conflict with his responsibilities as a father . Although the film portrays the way Lester returns to that role positively , he does not become " the hypermasculine figure implicitly celebrated in films like Fight Club " . Hausmann concludes that Lester 's behavior toward Angela is " a misguided but nearly necessary step toward his becoming a father again " . 
- Hausmann says the film " explicitly affirms the importance of upholding the prohibition against incest " ; a recurring theme of Ball 's work is his comparison of the taboos against incest and homosexuality . Instead of making an overt distinction , American Beauty looks at how their repression can lead to violence . Col. Fitts is so ashamed of his homosexuality that it drives him to murder Lester . Ball said , " The movie is in part about how homophobia is based in fear and repression and about what [ they ] can do . " The film implies two unfulfilled incestuous desires : Lester 's pursuit of Angela is a manifestation of his lust for his own daughter , while Col. Fitts ' repression is exhibited through the almost sexualized discipline with which he controls Ricky . Consequently , Ricky realizes that he can only hurt his father by falsely telling him he is homosexual , while Angela 's vulnerability and submission to Lester reminds him of his responsibilities and the limits of his fantasy . Col. Fitts represents Ball 's father , whose repressed homosexual desires led to his own unhappiness . Ball rewrote Col. Fitts to delay revealing him as homosexual , which Munt reads as a possible " deferment of Ball 's own patriarchal @-@ incest fantasies " . 
- 
- = = = Temporality and music = = = 
- 
- American Beauty follows a traditional narrative structure , only deviating with the displaced opening scene of Jane and Ricky from the middle of the story . Although the plot spans one year , the film is narrated by Lester at the moment of his death . Jacqueline Furby says that the plot " occupies [ ... ] no time [ or ] all time " , citing Lester 's claim that life did not flash before his eyes , but that it " stretches on forever like an ocean of time " . Furby argues that a " rhythm of repetition " forms the core of the film 's structure . For example , two scenes have the Burnhams sitting down to an evening meal , shot from the same angle . Each image is broadly similar , with minor differences in object placement and body language that reflect the changed dynamic brought on by Lester 's new @-@ found assertiveness . Another example is the pair of scenes in which Jane and Ricky film each other . Ricky films Jane from his bedroom window as she removes her bra , and the image is reversed later for a similarly " voyeuristic and exhibitionist " scene in which Jane films Ricky at a vulnerable moment . 
- Lester 's fantasies are emphasized by slow- and repetitive @-@ motion shots ; Mendes uses double @-@ and @-@ triple cutbacks in several sequences , and the score alters to make the audience aware that it is entering a fantasy . One example is the gymnasium scene — Lester 's first encounter with Angela . While the cheerleaders perform their half @-@ time routine to " On Broadway " , Lester becomes increasingly fixated on Angela . Time slows to represent his " voyeuristic hypnosis " and Lester begins to fantasize that Angela 's performance is for him alone . " On Broadway " — which provides a conventional underscore to the onscreen action — is replaced by discordant , percussive music that lacks melody or progression . This nondiegetic score is important to creating the narrative stasis in the sequence ; it conveys a moment for Lester that is stretched to an indeterminate length . The effect is one that Stan Link likens to " vertical time " , described by the composer and music theorist Jonathan Kramer as music that imparts " a single present stretched out into an enormous duration , a potentially infinite ' now ' that nonetheless feels like an instant " . The music is used like a visual cue , so that Lester and the score are staring at Angela . The sequence ends with the sudden reintroduction of " On Broadway " and teleological time . 
- According to Drew Miller of Stylus , the soundtrack " [ gives ] unconscious voice " to the characters ' psyches and complements the subtext . The most obvious use of pop music " accompanies and gives context to " Lester 's attempts to recapture his youth ; reminiscent of how the counterculture of the 1960s combated American repression through music and drugs , Lester begins to smoke cannabis and listen to rock music . Mendes ' song choices " progress through the history of American popular music " . Miller argues that although some may be over familiar , there is a parodic element at work , " making good on [ the film 's ] encouragement that viewers look closer " . Toward the end of the film , Thomas Newman 's score features more prominently , creating " a disturbing tempo " that matches the tension of the visuals . The exception is " Don 't Let It Bring You Down " , which plays during Angela 's seduction of Lester . At first appropriate , its tone clashes as the seduction stops . The lyrics , which speak of " castles burning " , can be seen as a metaphor for Lester 's view of Angela — " the rosy , fantasy @-@ driven exterior of the ' American Beauty ' " — as it burns away to reveal " the timid , small @-@ breasted girl who , like his wife , has willfully developed a false public self " . 
- 
- = = Production = = 
- 
- 
- = = = Development = = = 
- 
- In 1997 , Alan Ball resolved to move into the film industry after several frustrating years writing for the television sitcoms Grace Under Fire and Cybill . He joined the United Talent Agency , where his representative , Andrew Cannava , suggested he write a spec script to " reintroduce [ himself ] to the town as a screenwriter " . Ball pitched three ideas to Cannava : two conventional romantic comedies and American Beauty , which he had originally conceived as a play in the early 1990s . Despite the story 's lack of an easily marketable concept , Cannava selected American Beauty because he felt it was the one for which Ball had the most passion . While developing the script , Ball created another television sitcom , Oh , Grow Up . He channeled his anger and frustration at having to accede to network demands on that show — and during his tenures on Grace Under Fire and Cybill — into writing American Beauty . 
- Ball did not expect to sell the script , believing it would act as more of a calling card , but American Beauty drew interest from several production bodies . Cannava passed the script to several producers , including Dan Jinks and Bruce Cohen , who took it to DreamWorks . With the help of executives Glenn Williamson and Bob Cooper , and Steven Spielberg in his capacity as studio partner , Ball was convinced to develop the project at DreamWorks ; he received assurances from the studio — known at the time for its more conventional fare — that it would not " iron the [ edges ] out " . In an unusual move , DreamWorks decided not to option the script ; instead , in April 1998 , the studio bought it outright for $ 250 @,@ 000 , outbidding Fox Searchlight Pictures , October Films , Metro @-@ Goldwyn @-@ Mayer , and Lakeshore Entertainment . DreamWorks planned to make the film for $ 6 – 8 million . 
- Jinks and Cohen involved Ball throughout the film 's development , including casting and director selection . The producers met with about 20 interested directors , several of whom were considered " A @-@ list " at the time . Ball was not keen on the more well @-@ known directors because he believed their involvement would increase the budget and lead DreamWorks to become " nervous about the content " . Nevertheless , the studio offered the film to Mike Nichols and Robert Zemeckis ; neither accepted . In the same year , Mendes ( then a theater director ) revived the musical Cabaret in New York with fellow director Rob Marshall . Beth Swofford of the Creative Artists Agency arranged meetings for Mendes with studio figures in Los Angeles to see if film direction was a possibility . Mendes came across American Beauty in a pile of eight scripts at Swofford 's house , and knew immediately that it was the one he wanted to make ; early in his career , he had been inspired by how the film Paris , Texas ( 1984 ) presented contemporary America as a mythic landscape and he saw the same theme in American Beauty , as well as parallels with his own childhood . Mendes later met with Spielberg ; impressed by Mendes ' productions of Oliver ! and Cabaret , Spielberg encouraged him to consider American Beauty . 
- Mendes found that he still had to convince DreamWorks ' production executives to let him direct . He had already discussed the film with Jinks and Cohen , and felt they supported him . Ball was also keen ; having seen Cabaret , he was impressed with Mendes ' " keen visual sense " and thought he did not make obvious choices . Ball felt that Mendes liked to look under the story 's surface , a talent he felt would be a good fit with the themes of American Beauty . Mendes ' background also reassured him , because of the prominent role the playwright usually has in a theater production . Over two meetings — the first with Cooper , Walter Parkes , and Laurie MacDonald , the second with Cooper alone — Mendes pitched himself to the studio . The studio soon approached Mendes with a deal to direct for the minimum salary allowed under Directors Guild of America rules — $ 150 @,@ 000 . Mendes accepted , and later recalled that after taxes and his agent 's commission , he only earned $ 38 @,@ 000 . In June 1998 , DreamWorks confirmed that it had contracted Mendes to direct the film . 
- 
- = = = Writing = = = 
- 
- Ball was partly inspired by two encounters he had in the early 1990s . In about 1991 – 92 , Ball saw a plastic bag blowing in the wind outside the World Trade Center . He watched the bag for 10 minutes , saying later that it provoked an " unexpected emotional response " . In 1992 , Ball became preoccupied with the media circus around the Amy Fisher trial . Discovering a comic book telling of the scandal , he was struck by how quickly it had become commercialized . He said he " felt like there was a real story underneath [ that was ] more fascinating and way more tragic " than the story presented to the public , and attempted to turn the idea into a play . Ball produced around 40 pages , but stopped when he realized it would work better as a film . He felt that because of the visual themes , and because each character 's story was " intensely personal " , it could not be done on a stage . All the main characters appeared in this version , but Carolyn did not feature strongly ; Jim and Jim instead had much larger roles . 
- Ball based Lester 's story on aspects of his own life . Lester 's re @-@ examination of his life parallels feelings Ball had in his mid @-@ 30s ; like Lester , Ball put aside his passions to work in jobs he hated for people he did not respect . Scenes in Ricky 's household reflect Ball 's own childhood experiences . Ball suspected his father was homosexual and used the idea to create Col. Fitts , a man who " gave up his chance to be himself " . Ball said the script 's mix of comedy and drama was not intentional , but that it came unconsciously from his own outlook on life . He said the juxtaposition produced a starker contrast , giving each trait more impact than if they appeared alone . 
- In the script that was sent to prospective actors and directors , Lester and Angela had sex ; by the time of shooting , Ball had rewritten the scene to the final version . Ball initially rebuffed counsel from others that he change the script , feeling they were being puritanical ; the final impetus to alter the scene came from DreamWorks ' then @-@ president Walter Parkes . He convinced Ball by indicating that in Greek mythology , the hero " has a moment of epiphany before [ ... ] tragedy occurs " . Ball later said his anger when writing the first draft had blinded him to the idea that Lester needed to refuse sex with Angela to complete his emotional journey — to achieve redemption . Jinks and Cohen asked Ball not to alter the scene right away , as they felt it would be inappropriate to make changes to the script before a director had been hired . Early drafts also included a flashback to Col. Fitts ' service in the Marines , a sequence that unequivocally established his homosexual leanings . In love with another Marine , Col. Fitts sees the man die and comes to believe that he is being punished for the " sin " of being gay . Ball removed the sequence because it did not fit the structure of the rest of the film — Col. Fitts was the only character to have a flashback — and because it removed the element of surprise from Col. Fitts ' later pass at Lester . Ball said he had to write it for his own benefit to know what happened to Col. Fitts , though all that remained in later drafts was subtext . 
- Ball remained involved throughout production ; he had signed a television show development deal , so had to get permission from his producers to take a year off to be close to American Beauty . Ball was on @-@ set for rewrites and to help interpret his script for all but two days of filming . His original bookend scenes — in which Ricky and Jane are prosecuted for Lester 's murder after being framed by Col. Fitts — were excised in post @-@ production ; the writer later felt the scenes were unnecessary , saying they were a reflection of his " anger and cynicism " at the time of writing ( see " Editing " ) . Ball and Mendes revised the script twice before it was sent to the actors , and twice more before the first read @-@ through . 
- The shooting script features a scene in Angela 's car in which Ricky and Jane talk about death and beauty ; the scene differed from earlier versions , which set it as a " big scene on a freeway " in which the three witness a car crash and see a dead body . The change was a practical decision , as the production was behind schedule and they needed to cut costs . The schedule called for two days to be spent filming the crash , but only half a day was available . Ball agreed , but only if the scene could retain a line of Ricky 's where he reflects on having once seen a dead homeless woman : " When you see something like that , it 's like God is looking right at you , just for a second . And if you 're careful , you can look right back . " Jane asks : " And what do you see ? " Ricky : " Beauty . " Ball said , " They wanted to cut that scene . They said it 's not important . I said , ' You 're out of your fucking mind . It 's one of the most important scenes in the movie ! ' [ ... ] If any one line is the heart and soul of this movie , that is the line . " Another scene was rewritten to accommodate the loss of the freeway sequence ; set in a schoolyard , it presents a " turning point " for Jane in that she chooses to walk home with Ricky instead of going with Angela . By the end of filming , the script had been through 10 drafts . 
- 
- = = = Casting = = = 
- 
- Mendes had Spacey and Bening in mind for the leads from the beginning , but DreamWorks executives were unenthusiastic . The studio suggested several alternatives , including Bruce Willis , Kevin Costner , or John Travolta to play Lester , and Helen Hunt or Holly Hunter to play Carolyn . Mendes did not want a big star " weighing the film down " ; he felt Spacey was the right choice based on his performances in the 1995 films The Usual Suspects and Seven , and 1992 's Glengarry Glen Ross . Spacey was surprised ; he said , " I usually play characters who are very quick , very manipulative and smart . [ ... ] I usually wade in dark , sort of treacherous waters . This is a man living one step at a time , playing by his instincts . This is actually much closer to me , to what I am , than those other parts . " Mendes offered Bening the role of Carolyn without the studio 's consent ; although executives were upset at Mendes , by September 1998 , DreamWorks had entered negotiations with Spacey and Bening . 
- Spacey loosely based Lester 's early " schlubby " deportment on Walter Matthau . During the film , Lester 's physique improves from flabby to toned ; Spacey worked out during filming to improve his body , but because Mendes shot the scenes out of chronological order , Spacey varied postures to portray the stages . Before filming , Mendes and Spacey analyzed Jack Lemmon 's performance in The Apartment ( 1960 ) , because Mendes wanted Spacey to emulate " the way [ Lemmon ] moved , the way he looked , the way he was in that office and the way he was an ordinary man and yet a special man " . Spacey 's voiceover is a throwback to Sunset Boulevard ( 1950 ) , which is also narrated in retrospect by a dead character . Mendes felt it evoked Lester 's — and the film 's — loneliness . Bening recalled women from her youth to inform her performance : " I used to babysit constantly . You 'd go to church and see how people present themselves on the outside , and then be inside their house and see the difference . " Bening and a hair stylist collaborated to create a " PTA president coif " hairstyle , and Mendes and production designer Naomi Shohan researched mail @-@ order catalogs to better establish Carolyn 's environment of a " spotless suburban manor " . To help Bening get into Carolyn 's mindset , Mendes gave her music that he believed Carolyn would like . He lent Bening the Bobby Darin version of the song " Don 't Rain on My Parade " , which she enjoyed and persuaded the director to include for a scene in which Carolyn sings in her car . 
- For the roles of Jane , Ricky , and Angela , DreamWorks gave Mendes carte blanche . By November 1998 , Thora Birch , Wes Bentley , and Mena Suvari had been cast in the parts — in Birch 's case , despite the fact she was underage for her nude scene . As Birch was 16 at the time she made the film , and thus classified as a minor in the United States , her parents had to approve her brief topless scene in the movie . Her parents and child labor representatives were on the set for the shooting of the scene . Bentley overcame competition from top actors under the age of 25 to be cast . The 2009 documentary My Big Break followed Bentley , and several other young actors , before and after he landed the part . To prepare , Mendes provided Bentley with a video camera , telling the actor to film what Ricky would . Peter Gallagher and Allison Janney were cast ( as Buddy Kane and Barbara Fitts ) after filming began in December 1998 . Mendes gave Janney a book of paintings by Edvard Munch . He told her , " Your character is in there somewhere . " Mendes cut much of Barbara 's dialogue , including conversations between Colonel Fitts and her , as he felt that what needed to be said about the pair — their humanity and vulnerability — was conveyed successfully through their shared moments of silence . Chris Cooper plays Colonel Fitts , Scott Bakula plays Jim Olmeyer , and Sam Robards plays Jim Berkley . Jim and Jim were deliberately depicted as the most normal , happy — and boring — couple in the film . Ball 's inspiration for the characters came from a thought he had after seeing a " bland , boring , heterosexual couple " who wore matching clothes : " I can 't wait for the time when a gay couple can be just as boring . " Ball also included aspects of a gay couple he knew who had the same forename . 
- Mendes insisted on two weeks of cast rehearsals , although the sessions were not as formal as he was used to in the theater , and the actors could not be present at every one . Several improvisations and suggestions by the actors were incorporated into the script . An early scene showing the Burnhams leaving home for work was inserted later on to show the low point that Carolyn and Lester 's relationship had reached . Spacey and Bening worked to create a sense of the love that Lester and Carolyn once had for one another ; for example , the scene in which Lester almost seduces Carolyn after the pair argues over Lester 's buying a car was originally " strictly contentious " . 
- 
- = = = Filming = = = 
- 
- Principal photography lasted about 50 days from December 14 , 1998 , to February 1999 . American Beauty was filmed on soundstages at the Warner Bros. backlot in Burbank , California , and at Hancock Park and Brentwood in Los Angeles . The aerial shots at the beginning and end of the film were captured in Sacramento , California , and many of the school scenes were shot at South High School in Torrance , California ; several extras in the gym crowd were South High students . The film is set in an upper middle @-@ class neighborhood in an unidentified American town . Production designer Naomi Shohan likened the locale to Evanston , Illinois , but said , " it 's not about a place , it 's about an archetype . [ ... ] The milieu was pretty much Anywhere , USA — upwardly mobile suburbia . " The intent was for the setting to reflect the characters , who are also archetypes . Shohan said , " All of them are very strained , and their lives are constructs . " The Burnhams ' household was designed as the reverse of the Fitts ' — the former a pristine ideal , but graceless and lacking in " inner balance " , leading to Carolyn 's desire to at least give it the appearance of a " perfect all @-@ American household " ; the Fitts ' home is depicted in " exaggerated darkness [ and ] symmetry " . 
- The production selected two adjacent properties on the Warner backlot 's " Blondie Street " for the Burnham and Fitts ' homes . The crew rebuilt the houses to incorporate false rooms that established lines of sight — between Ricky and Jane 's bedroom windows , and between Ricky 's bedroom and Lester 's garage . The garage windows were designed specifically to obtain the crucial shot toward the end of the film in which Col. Fitts — watching from Ricky 's bedroom — mistakenly assumes that Lester is paying Ricky for sex . Mendes made sure to establish the line of sight early on in the film to make the audience feel a sense of familiarity with the shot . The house interiors were filmed on the backlot , on location , and on soundstages when overhead shots were needed . The inside of the Burnhams ' home was shot at a house close to Interstate 405 and Sunset Boulevard in Los Angeles ; the inside of the Fitts ' home was shot in the city 's Hancock Park neighborhood . Ricky 's bedroom was designed to be cell @-@ like to suggest his " monkish " personality , while at the same time blending with the high @-@ tech equipment to reflect his voyeuristic side . The production deliberately minimized the use of red , as it was an important thematic signature elsewhere . The Burnhams ' home uses cool blues , while the Fitts ' is kept in a " depressed military palette " . 
- Mendes ' dominating visual style was deliberate and composed , with a minimalist design that provided " a sparse , almost surreal feeling — a bright , crisp , hard edged , near Magritte @-@ like take on American suburbia " ; Mendes constantly directed his set dressers to empty the frame . He made Lester 's fantasy scenes " more fluid and graceful " , and Mendes made minimal use of steadicams , feeling that stable shots generated more tension . For example , when Mendes used a slow push in to the Burnhams ' dinner table , he held the shot because his training as a theater director taught him the importance of putting distance between the characters . He wanted to keep the tension in the scene , so he only cut away when Jane left the table . Mendes did use a hand @-@ held camera for the scene in which Col. Fitts beats Ricky . Mendes said the camera provided the scene with a " kinetic [ ... ] off @-@ balance energy " . He also went hand @-@ held for the excerpts of Ricky 's camcorder footage . Mendes took a long time to get the quality of Ricky 's footage to the level he wanted . For the plastic @-@ bag footage , Mendes used wind machines to move the bag in the air . The scene took four takes ; two by the second unit did not satisfy Mendes , so he shot the scene himself . He felt his first take lacked grace , but for the last attempt , he changed the location to the front of a brick wall and added leaves on the ground . Mendes was satisfied by the way the wall gave definition to the outline of the bag . 
- Mendes avoided using close @-@ ups , as he believed the technique was overused ; he also cited Spielberg 's advice that he should imagine an audience silhouetted at the bottom of the camera monitor , to keep in mind that he was shooting for display on a 40 @-@ foot ( 10 m ) screen . Spielberg — who visited the set a few times — also advised Mendes not to worry about costs if he had a " great idea " toward the end of a long working day . Mendes said , " That happened three or four times , and they are all in the movie . " Despite Spielberg 's support , DreamWorks and Mendes fought constantly over the schedule and budget , although the studio interfered little with the film 's content . Spacey , Bening and Hall worked for significantly less than their usual rates . American Beauty cost DreamWorks $ 15 million to produce , slightly above their projected sum . Mendes was so dissatisfied with his first three days ' filming that he obtained permission from DreamWorks to reshoot the scenes . He said , " I started with a wrong scene , actually , a comedy scene . And the actors played it way too big : [ ... ] it was badly shot , my fault , badly composed , my fault , bad costumes , my fault [ ... ] ; and everybody was doing what I was asking . It was all my fault . " Aware that he was a novice , Mendes drew on the experience of Hall : " I made a very conscious decision early on , if I didn 't understand something technically , to say , without embarrassment , ' I don 't understand what you 're talking about , please explain it . ' " 
- Mendes encouraged some improvisation ; for example , when Lester masturbates in bed beside Carolyn , the director asked Spacey to improvise several euphemisms for the act in each take . Mendes said , " I wanted that not just because it was funny [ ... ] but because I didn 't want it to seem rehearsed . I wanted it to seem like he was blurting it out of his mouth without thinking . [ Spacey ] is so in control — I wanted him to break through . " Spacey obliged , eventually coming up with 35 phrases , but Bening could not always keep a straight face , which meant the scene had to be shot 10 times . The production used small amounts of computer @-@ generated imagery . Most of the rose petals in Lester 's fantasies were added in post @-@ production , although some were real and had the wires holding them digitally removed . When Lester fantasizes about Angela in a rose @-@ petal bath , the steam was real , save for in the overhead shot . To position the camera , a hole had to be cut in the ceiling , through which the steam escaped ; it was instead added digitally . 
- 
- = = = Editing = = = 
- 
- American Beauty was edited by Christopher Greenbury and Tariq Anwar ; Greenbury began in the position , but had to leave halfway through post @-@ production because of a scheduling conflict with Me , Myself and Irene ( 2000 ) ( in which Chris Cooper also starred ) . Mendes and an assistant edited the film for 10 days between the appointments . Mendes realized during editing that the film was different from the one he had envisioned . He believed he had been making a " much more whimsical , [ ... ] kaleidoscopic " film than what came together in the edit suite . Instead , Mendes was drawn to the emotion and darkness ; he began to use the score and shots he had intended to discard to craft the film along these lines . In total , he cut about 30 minutes from his original edit . The opening included a dream in which Lester imagines himself flying above the town . Mendes spent two days filming Spacey against bluescreen , but removed the sequence as he believed it to be too whimsical — " like a Coen brothers movie " — and therefore inappropriate for the tone he was trying to set . The opening in the final cut reused a scene from the middle of the film where Jane tells Ricky to kill her father . This scene was to be the revelation to the audience that the pair was not responsible for Lester 's death , as the way it was scored and acted made it clear that Jane 's request was not serious . However , in the portion he used in the opening — and when the full scene plays out later — Mendes used the score and a reaction shot of Ricky to leave a lingering ambiguity as to his guilt . The subsequent shot — an aerial view of the neighborhood — was originally intended as the plate shot for the bluescreen effects in the dream sequence . 
- Mendes spent more time recutting the first 10 minutes than the rest of the film taken together . He trialled several versions of the opening ; the first edit included bookend scenes in which Jane and Ricky are convicted of Lester 's murder , but Mendes excised these in the last week of editing because he felt they made the film lose its mystery , and because they did not fit with the theme of redemption that had emerged during production . Mendes believed the trial drew focus away from the characters and turned the film " into an episode of NYPD Blue " . Instead , he wanted the ending to be " a poetic mixture of dream and memory and narrative resolution " . When Ball first saw a completed edit , it was a version with truncated versions of these scenes . He felt that they were so short that they " didn 't really register " . Mendes and he argued , but Ball was more accepting after Mendes cut the sequences completely ; Ball felt that without the scenes , the film was more optimistic and had evolved into something that " for all its darkness had a really romantic heart " . 
- 
- = = = Cinematography = = = 
- 
- Conrad Hall was not the first choice for director of photography ; Mendes believed he was " too old and too experienced " to want the job , and he had been told that Hall was difficult to work with . Instead , Mendes asked Fred Elmes , who turned the job down because he did not like the script . Hall was recommended to Mendes by Tom Cruise , because of Hall 's work on Without Limits ( 1998 ) , which Cruise had executive produced . Mendes was directing Cruise 's then @-@ wife Nicole Kidman in the play The Blue Room during preproduction on American Beauty , and had already storyboarded the whole film . Hall was involved for one month during preproduction ; his ideas for lighting the film began with his first reading of the script , and further passes allowed him to refine his approach before meeting Mendes . Hall was initially concerned that audiences would not like the characters ; he only felt able to identify with them during cast rehearsals , which gave him fresh ideas on his approach to the visuals . 
- Hall 's approach was to create peaceful compositions that evoked classicism , to contrast with the turbulent on @-@ screen events and allow audiences to take in the action . Hall and Mendes first discussed the intended mood of a scene , but he was allowed to light the shot in any way he felt necessary . In most cases , Hall first lit the scene 's subject by " painting in " the blacks and whites , before adding fill light , which he reflected from beadboard or white card on the ceiling . This approach gave Hall more control over the shadows while keeping the fill light unobtrusive and the dark areas free of spill . Hall shot American Beauty in a 2 @.@ 39 : 1 aspect ratio in the Super 35 format , using Kodak Vision 500T 5279 35 mm film stock . He used Super 35 partly because its larger scope allowed him to capture elements such as the corners of the petal @-@ filled pool in its overhead shot , creating a frame around Angela within . He shot the whole film at the same T @-@ stop ( T1.9 ) ; given his preference for shooting that wide , Hall favored high @-@ speed stocks to allow for more subtle lighting effects . He used Panavision Platinum cameras with the company 's Primo series of prime and zoom lenses . Hall employed Kodak Vision 200T 5274 and EXR 5248 stock for scenes with daylight effects . He had difficulty adjusting to Kodak 's newly introduced Vision release print stock , which , combined with his contrast @-@ heavy lighting style , created a look with too much contrast . Hall contacted Kodak , who sent him a batch of 5279 that was 5 % lower in contrast . Hall used a 1 / 8 inch Tiffen Black ProMist filter for almost every scene , which he said in retrospect may not have been the best choice , as the optical steps required to blow Super 35 up for its anamorphic release print led to a slight amount of degradation ; therefore , the diffusion from the filter was not required . 
When he saw the film in a theater , Hall felt that the image was slightly unclear and that had he not used the filter , the diffusion from the Super 35 – anamorphic conversion would have generated an image closer to what he originally intended . 
- A shot where Lester and Ricky share a cannabis joint behind a building came from a misunderstanding between Hall and Mendes . Mendes asked Hall to prepare the shot in his absence ; Hall assumed the characters would look for privacy , so he placed them in a narrow passage between a truck and the building , intending to light from the top of the truck . When Mendes returned , he explained that the characters did not care if they were seen . He removed the truck and Hall had to rethink the lighting ; he lit it from the left , with a large light crossing the actors , and with a soft light behind the camera . Hall felt the consequent wide shot " worked perfectly for the tone of the scene " . Hall made sure to keep rain , or the suggestion of it , in every shot near the end of the film . In one shot during Lester 's encounter with Angela at the Burnhams ' home , Hall created rain effects on the foreground cross lights ; in another , he partly lit the pair through French windows to which he had added material to make the rain run slower , intensifying the light ( although the strength of the outside light was unrealistic for a night scene , Hall felt it justified because of the strong contrasts it produced ) . For the close @-@ ups when Lester and Angela move to the couch , Hall tried to keep rain in the frame , lighting through the window onto the ceiling behind Lester . He also used rain boxes to produce rain patterns where he wanted without lighting the entire room . 
- 
- = = = Music = = = 
- 
- Thomas Newman 's score was recorded in Santa Monica , California . He mainly used percussion instruments to create the mood and rhythm , the inspiration for which was provided by Mendes . Newman " favored pulse , rhythm , and color over melody " , making for a more minimalist score than he had previously created . He built each cue around " small , endlessly repeating phrases " — often , the only variety came through a " thinning of the texture for eight bars " . The percussion instruments included tablas , bongos , cymbals , piano , xylophones , and marimbas ; also featured were guitars , flute , and world music instruments . Newman also used electronic music and on " quirkier " tracks employed more unorthodox methods , such as tapping metal mixing bowls with a finger and using a detuned mandolin . Newman believed the score helped move the film along without disturbing the " moral ambiguity " of the script : " It was a real delicate balancing act in terms of what music worked to preserve [ that ] . " 
- The soundtrack features songs by Newman , Bobby Darin , The Who , Free , Eels , The Guess Who , Bill Withers , Betty Carter , Peggy Lee , The Folk Implosion , Gomez , and Bob Dylan , as well as two cover versions — The Beatles ' " Because " performed by Elliott Smith , and Neil Young 's " Don 't Let It Bring You Down " performed by Annie Lennox . Produced by the film 's music supervisor Chris Douridas , an abridged soundtrack album was released on October 5 , 1999 , and went on to be nominated for a Grammy Award for Best Soundtrack Album . An album featuring 19 tracks from Newman 's score was released on January 11 , 2000 , and won the Grammy Award for Best Score Soundtrack Album . Filmmaker considered the score to be one of Newman 's best , saying it " [ enabled ] the film 's transcendentalist aspirations " . In 2006 , the magazine chose the score as one of 20 essential soundtracks it believed spoke to the " complex and innovative relationships between music and screen storytelling " . 
- 
- = = Release = = 
- 
- 
- = = = Publicity = = = 
- 
- DreamWorks contracted Amazon.com to create the official website , marking the first time that Amazon had created a special section devoted to a feature film . The website included an overview , a photo gallery , cast and crew filmographies , and exclusive interviews with Spacey and Bening . The film 's tagline — " look closer " — originally came from a cutting pasted on Lester 's workplace cubicle by the set dresser . DreamWorks ran parallel marketing campaigns and trailers — one aimed at adults , the other at teenagers . Both trailers ended with the poster image of a girl holding a rose . Reviewing the posters of several 1999 films , David Hochman of Entertainment Weekly rated American Beauty 's highly , saying it evoked the tagline ; he said , " You return to the poster again and again , thinking , this time you 're gonna find something . " DreamWorks did not want to test screen the film ; according to Mendes , the studio was pleased with it , but he insisted on one where he could question the audience afterward . The studio reluctantly agreed and showed the film to a young audience in San Jose , California . Mendes claimed the screening went very well . 
- 
- = = = Theatrical run = = = 
- 
- The film had its world premiere on September 8 , 1999 , at Grauman 's Egyptian Theatre in Los Angeles . Three days later , the film appeared at the Toronto International Film Festival . With the filmmakers and cast in attendance , it screened at several American universities , including the University of California at Berkeley , New York University , the University of California at Los Angeles , the University of Texas at Austin , and Northwestern University . 
- On September 15 , 1999 , American Beauty opened to the public in limited release at three theaters in Los Angeles and three in New York . More theaters were added during the limited run , and on October 1 , the film officially entered wide release by screening in 706 theaters across North America . The film grossed $ 8 @,@ 188 @,@ 587 over the weekend , ranking third at the box office . Audiences polled by the market research firm CinemaScore gave American Beauty a " B + " grade on average . The theater count hit a high of 1 @,@ 528 at the end of the month , before a gradual decline . Following American Beauty 's wins at the 57th Golden Globe Awards , DreamWorks re @-@ expanded the theater presence from a low of 7 in mid @-@ February , to a high of 1 @,@ 990 in March . The film ended its North American theatrical run on June 4 , 2000 , having grossed $ 130 @.@ 1 million . 
- American Beauty had its European premiere at the London Film Festival on November 18 , 1999 ; in January 2000 , it began to screen in various territories outside North America . It debuted in Israel to " potent " returns , and limited releases in Germany , Italy , Austria , Switzerland , the Netherlands and Finland followed on January 21 . After January 28 opening weekends in Australia , the United Kingdom , Spain and Norway , American Beauty had earned $ 7 million in 12 countries for a total of $ 12 @.@ 1 million outside North America . On February 4 , American Beauty debuted in France and Belgium . Expanding to 303 theaters in the United Kingdom , the film ranked first at the box office with $ 1 @.@ 7 million . On the weekend of February 18 — following American Beauty 's eight nominations for the 72nd Academy Awards — the film grossed $ 11 @.@ 7 million from 21 territories , for a total of $ 65 @.@ 4 million outside North America . The film had " dazzling " debuts in Hungary , Denmark , the Czech Republic , Slovakia , and New Zealand . 
- As of February 18 , the most successful territories were the United Kingdom ( $ 15 @.@ 2 million ) , Italy ( $ 10 @.@ 8 million ) , Germany ( $ 10 @.@ 5 million ) , Australia ( $ 6 million ) , and France ( $ 5 @.@ 3 million ) . The Academy Award nominations meant strong performances continued across the board ; the following weekend , American Beauty grossed $ 10 @.@ 9 million in 27 countries , with strong debuts in Brazil , Mexico , and South Korea . Other high spots included robust returns in Argentina , Greece , and Turkey . On the weekend of March 3 , 2000 , American Beauty debuted strongly in Hong Kong , Taiwan , and Singapore , markets traditionally " not receptive to this kind of upscale fare " . The impressive South Korean performance continued , with a return of $ 1 @.@ 2 million after nine days . In total , American Beauty grossed $ 130 @.@ 1 million in North America and $ 226 @.@ 2 million internationally , for $ 356 @.@ 3 million worldwide . 
- 
- = = = Home media = = = 
- 
- American Beauty was released on VHS on May 9 , 2000 , and on DVD with the DTS format on October 24 , 2000 . Before the North American rental release on May 9 , Blockbuster Video wanted to purchase hundreds of thousands of extra copies for its " guaranteed title " range , whereby anyone who wanted to rent the film would be guaranteed a copy . Blockbuster and DreamWorks could not agree on a profit @-@ sharing deal , so Blockbuster ordered two @-@ thirds the number of copies it originally intended . DreamWorks made around one million copies available for rental ; Blockbuster 's share would usually have been about 400 @,@ 000 of these . Some Blockbuster stores only displayed 60 copies , and others did not display the film at all , forcing customers to ask for it . The strategy required staff to read a statement to customers explaining the situation ; Blockbuster claimed it was only " [ monitoring ] customer demand " due to the reduced availability . Blockbuster 's strategy leaked before May 9 , leading to a 30 % order increase from other retailers . In its first week of rental release , American Beauty made $ 6 @.@ 8 million . This return was lower than would have been expected had DreamWorks and Blockbuster reached an agreement . In the same year , The Sixth Sense made $ 22 million , while Fight Club made $ 8 @.@ 1 million , though the latter 's North American theatrical performance was just 29 % that of American Beauty . Blockbuster 's strategy also affected rental fees ; American Beauty averaged $ 3 @.@ 12 , compared with $ 3 @.@ 40 for films that Blockbuster fully promoted . Only 53 % of the film 's rentals were from large outlets in the first week , compared with the usual 65 % . 
- The DVD release included a behind @-@ the @-@ scenes featurette , film audio commentary from Mendes and Ball , and a storyboard presentation with discussion from Mendes and Hall . In the film commentary , Mendes refers to deleted scenes he intended to include in the release . However , these scenes are not on the DVD , as he changed his mind after recording the commentary ; Mendes felt that to show scenes he previously chose not to use would detract from the film 's integrity . 
- On September 21 , 2010 , Paramount Home Entertainment released American Beauty on Blu @-@ ray , as part of Paramount 's Sapphire Series . All the extras from the DVD release were present , with the theatrical trailers upgraded to HD . 
- 
- = = Critical reception = = 
- 
- American Beauty was widely considered the best film of 1999 by the American press . It received overwhelming praise , chiefly for Spacey , Mendes and Ball . Variety reported that " no other 1999 movie has benefited from such universal raves . " It was the best @-@ received title at the Toronto International Film Festival ( TIFF ) , where it won the People 's Choice award after a ballot of the festival 's audiences . TIFF 's director , Piers Handling , said , " American Beauty was the buzz of the festival , the film most talked @-@ about . " 
- Writing in Variety , Todd McCarthy said the cast ensemble " could not be better " ; he praised Spacey 's " handling of innuendo , subtle sarcasm , and blunt talk " and the way he imbued Lester with " genuine feeling " . Janet Maslin in The New York Times said Spacey was at his " wittiest and most agile " to date , and Roger Ebert of the Chicago Sun @-@ Times singled Spacey out for successfully portraying a man who " does reckless and foolish things [ but who ] doesn 't deceive himself " . Kevin Jackson of Sight & Sound said Spacey impressed in ways distinct from his previous performances , the most satisfying aspect being his portrayal of " both sap and hero " . Writing in Film Quarterly , Gary Hentzi praised the actors , but said that characters such as Carolyn and Col. Fitts were stereotypes . Hentzi accused Mendes and Ball of identifying too readily with Jane and Ricky , saying the latter was their " fantasy figure " — a teenaged boy who 's an absurdly wealthy artist able to " finance [ his ] own projects " . Hentzi said Angela was the most believable teenager , in particular with her " painfully familiar " attempts to " live up to an unworthy image of herself " . Maslin agreed that some characters were unoriginal , but said their detailed characterizations made them memorable . Kenneth Turan of the Los Angeles Times said the actors coped " faultlessly " with what were difficult roles ; he called Spacey 's performance " the energy that drives the film " , saying the actor commanded audience involvement despite Lester not always being sympathetic . " Against considerable odds , we do like [ these characters ] , " Turan concluded . 
- Maslin felt that Mendes directed with " terrific visual flair " , saying his minimalist style balanced " the mordant and bright " and that he evoked the " delicate , eroticized power @-@ playing vignettes " of his theater work . Jackson said Mendes ' theatrical roots rarely showed , and that the " most remarkable " aspect was that Spacey 's performance did not overshadow the film . He said that Mendes worked the script 's intricacies smoothly , to the ensemble 's strengths , and staged the tonal shifts skillfully . McCarthy believed American Beauty a " stunning card of introduction " for film débutantes Mendes and Ball . He said Mendes ' " sure hand " was " as precise and controlled " as his theater work . McCarthy cited Hall 's involvement as fortunate for Mendes , as the cinematographer was " unsurpassed " at conveying the themes of a work . Turan agreed that Mendes ' choice of collaborators was " shrewd " , naming Hall and Newman in particular . Turan suggested that American Beauty may have benefited from Mendes ' inexperience , as his " anything 's possible daring " made him attempt beats that more seasoned directors might have avoided . Turan felt that Mendes ' accomplishment was to " capture and enhance [ the ] duality " of Ball 's script — the simultaneously " caricatured [ ... ] and painfully real " characters . Hentzi , while critical of many of Mendes and Ball 's choices , admitted the film showed off their " considerable talents " . 
- Turan cited Ball 's lack of constraint when writing the film as the reason for its uniqueness , in particular the script 's subtle changes in tone . McCarthy said the script was " as fresh and distinctive " as any of its American film contemporaries , and praised how it analyzed the characters while not compromising narrative pace . He called Ball 's dialogue " tart " and said the characters — Carolyn excepted — were " deeply drawn " . One other flaw , McCarthy said , was the revelation of Col. Fitts ' homosexuality , which he said evoked " hoary Freudianism " . Jackson said the film transcended its clichéd setup to become a " wonderfully resourceful and sombre comedy " . He said that even when the film played for sitcom laughs , it did so with " unexpected nuance " . Hentzi criticized how the film made a mystery of Lester 's murder , believing it manipulative and simply a way of generating suspense . McCarthy cited the production and costume design as pluses , and said the soundtrack was good at creating " ironic counterpoint [ s ] " to the story . Hentzi concluded that American Beauty was " vital but uneven " ; he felt the film 's examination of " the ways which teenagers and adults imagine each other 's lives " was its best point , and that although Lester and Angela 's dynamic was familiar , its romantic irony stood beside " the most enduring literary treatments " of the theme , such as Lolita . Nevertheless , Hentzi believed that the film 's themes of materialism and conformity in American suburbia were " hackneyed " . McCarthy conceded that the setting was familiar , but said it merely provided the film with a " starting point " from which to tell its " subtle and acutely judged tale " . Maslin agreed ; she said that while it " takes aim at targets that are none too fresh " , and that the theme of nonconformity did not surprise , the film had its own " corrosive novelty " . 
Ebert awarded American Beauty four stars out of four , and Turan said it was layered , subversive , complex , and surprising , concluding it was " a hell of a picture " . 
- A few months after the film 's release , reports of a backlash appeared in the American press , and the years since have seen its critical regard wane . In 2005 , Premiere named American Beauty as one of 20 " most overrated movies of all time " ; Mendes accepted the inevitability of the critical reappraisal , saying , " I thought some of it was entirely justified — it was a little overpraised at the time . " 
- Currently , the film holds an 88 % score on Rotten Tomatoes based on 180 reviews , with an average rating of 8 @.@ 2 / 10 ; the critical consensus reads , " Flawlessly cast and brimming with dark , acid wit , American Beauty is a smart , provocative high point of late ' 90s mainstream Hollywood film . " Metacritic gives the film a score of 86 , based on 33 reviews , indicating " universal acclaim . " 
- 
- = = Accolades = = 
- 
- American Beauty was not considered an immediate favorite to dominate the American awards season . Several other contenders opened at the end of 1999 , and US critics spread their honors among them when compiling their end @-@ of @-@ year lists . The Chicago Film Critics Association and the Broadcast Film Critics Association named the film the best of 1999 , but while the New York Film Critics Circle , the National Society of Film Critics and the Los Angeles Film Critics Association recognized American Beauty , they gave their top awards to other films . By the end of the year , reports of a critical backlash suggested American Beauty was the underdog in the race for Best Picture ; however , at the Golden Globe Awards in January 2000 , American Beauty won Best Film , Best Director and Best Screenplay . 
- As the nominations for the 72nd Academy Awards approached , a frontrunner had not emerged . DreamWorks had launched a major campaign for American Beauty five weeks before ballots were due to be sent to the 5 @,@ 600 Academy Award voters . Its campaign combined traditional advertising and publicity with more focused strategies . Although direct mail campaigning was prohibited , DreamWorks reached voters by promoting the film in " casual , comfortable settings " in voters ' communities . The studio 's candidate for Best Picture the previous year , Saving Private Ryan , lost to Shakespeare in Love , so the studio took a new approach by hiring outsiders to provide input for the campaign . It hired three veteran consultants , who told the studio to " think small " . Nancy Willen encouraged DreamWorks to produce a special about the making of American Beauty , to set up displays of the film in the communities ' bookstores , and to arrange a question @-@ and @-@ answer session with Mendes for the British Academy of Film and Television Arts . Dale Olson advised the studio to advertise in free publications that circulated in Beverly Hills — home to many voters — in addition to major newspapers . Olson arranged to screen American Beauty to about 1 @,@ 000 members of the Actors Fund of America , as many participating actors were also voters . Bruce Feldman took Ball to the Santa Barbara International Film Festival , where Ball attended a private dinner in honor of Anthony Hopkins , meeting several voters who were in attendance . 
- In February 2000 , American Beauty was nominated for eight Academy Awards ; its closest rivals , The Cider House Rules and The Insider , received seven nominations each . In March 2000 , the major industry labor organizations all awarded their top honors to American Beauty ; perceptions had shifted — the film was now the favorite to dominate the Academy Awards . American Beauty 's closest rival for Best Picture was still The Cider House Rules , from Miramax . Both studios mounted aggressive campaigns ; DreamWorks bought 38 % more advertising space in Variety than Miramax . On March 26 , 2000 , American Beauty won five Academy Awards : Best Picture , Best Director , Best Actor ( Spacey ) , Best Original Screenplay and Best Cinematography . At the 53rd British Academy Film Awards , American Beauty won six of the 14 awards for which it was nominated : Best Film , Best Actor , Best Actress ( Bening ) , Best Cinematography , Best Film Music and Best Editing . In 2000 , the Publicists Guild of America recognized DreamWorks for the best film publicity campaign . In September 2008 , Empire named American Beauty the 96th " Greatest Movie of All Time " after a poll of 10 @,@ 000 readers , 150 filmmakers , and 50 film critics , the fourth @-@ highest ranked movie from 1999 ( behind Fight Club , The Matrix , and Magnolia ) . In 2013 , the Writers Guild of America ranked the screenplay number 38 on its list of 101 greatest screenplays . 
- The film was nominated for AFI 's 100 Years ... 100 Movies ( 10th Anniversary Edition ) in 2007 . 
- 
- 
- = Christopher Gore = 
- 
- Christopher Gore ( September 21 , 1758 – March 1 , 1827 ) was a prominent Massachusetts lawyer , Federalist politician , and U.S. diplomat . Born into a family divided by the American Revolution , Gore sided with the victorious Patriots , established a successful law practice in Boston , and built a fortune by purchasing Revolutionary government debts at a discount and receiving full value for them from the government . 
- Gore entered politics in 1788 , serving briefly in the Massachusetts legislature before being appointed U.S. District Attorney for Massachusetts . He was then appointed by President George Washington to a diplomatic commission dealing with maritime claims in Great Britain . He returned to Massachusetts in 1804 and reentered state politics , running unsuccessfully for governor several times before winning in 1809 . He served one term , losing to Democratic @-@ Republican Elbridge Gerry in 1810 . He was appointed to the US Senate by Governor Caleb Strong in 1813 , where he led opposition to the War of 1812 . 
- Gore invested his fortune in a variety of businesses , including important infrastructure projects such as the Middlesex Canal and a bridge across the Charles River . He was a major investor in the early textile industry , funding the Boston Manufacturing Company and the Merrimack Manufacturing Company , whose business established the city of Lowell , Massachusetts . Gore was involved in a variety of charitable causes , and was a major benefactor of Harvard College , where the first library was named in his honor . His palatial mansion in Waltham , Massachusetts , now known as Gore Place , is one of the finest extant examples of Federalist architecture , and has been declared a National Historic Landmark . 
- 
- = = Early years = = 
- 
- Christopher Gore was born in Boston on September 21 , 1758 , one of many children of Frances and John Gore , a successful merchant and artisan . He was the youngest of their three sons to survive to adulthood . He attended Boston Latin School , and entered Harvard College at the young ( even for the time ) age of thirteen . At the outset of the American Revolutionary War and the Siege of Boston in 1775 , Harvard 's buildings were occupied by the Continental Army , and Gore temporarily continued his studies in Bradford until Harvard could resume operations in Concord . While at Harvard Gore participated in a speaking club , and formed significant lifelong friendships with Rufus King and John Trumbull . 
- Gore graduated in 1776 , and promptly enlisted in the Continental artillery regiment of his brother @-@ in @-@ law Thomas Crafts , where he served as a clerk until 1778 . The Gore family was divided by the war : Gore 's father was a Loyalist who left Boston when the British Army evacuated the city in March 1776 . Gore was consequently called upon to support his mother and three sisters , who remained in Boston . In 1779 Gore successfully petitioned the state for the remaining family 's share of his father 's seized assets . 
- 
- = = Early legal career = = 
- 
- After his military service Gore studied law with John Lowell , and was admitted to the bar in 1778 after a comparatively brief tutelage . Gore 's law practice flourished , in part because many Loyalist lawyers had fled Massachusetts . Gore 's clients included Loyalists seeking to recover some of their assets , as well as London @-@ based British merchants with claims to pursue . His briefs were generally well @-@ reasoned , and he was seen as a successful trial lawyer . 
- Gore grew his fortune by investing carefully in revolutionary currency and bonds . The securities he purchased were paper that had been given to Continental Army soldiers in lieu of pay , which they often sold at a steep discount . One batch of securities he purchased , for instance , cost him about $ 3 @,@ 700 but had a face value of $ 25 @,@ 000 . In 1785 he married Rebecca Amory Payne , daughter of a wealthy merchant , maritime insurer , and director of the Bank of Massachusetts . The couple were known for their social graces and became prominent members of Boston society . 
- In 1786 Gore became concerned about a rise in anti @-@ lawyer sentiment in Massachusetts . Grievances over harsh policies pursued by Governor James Bowdoin blossomed into Shays ' Rebellion , which required militia action to crush in 1787 . Gore was one of several high @-@ profile lawyers assigned to defend participants in the rebellion ( included in this group were Theodore Sedgwick , Caleb Strong , James Sullivan , Levi Lincoln , Sr. , and Thomas Dawes ) . Although many rebels were ultimately convicted , a large number received amnesty . In 1788 , Gore was elected a delegate to the 1788 Massachusetts convention that ratified the United States Constitution . His election was contested because Boston , where he lived , was at the time more inclined toward state power . Gore nonetheless was strongly Federalist , urging support of the new Constitution . 
- 
- = = Legislator , banker , and speculator = = 
- 
- In 1788 Gore was elected to the Massachusetts House of Representatives . He took a leading role in adopting the state 's rules for actions required of it by the new federal constitution . By his proposal the legislature decided that presidential electors would be chosen by a joint session . He also proposed that the state House and Senate agree by separate votes on choices for the United States Senate , a process that would significantly reduce popular input to the choice . His choice was ultimately rejected in favor of a process whereby the House selected a slate of candidates , from which the Senate would choose one . In 1789 Gore decided to stand for reelection , but lost , owing to strong anti @-@ nationalist fervor in Boston at the time . He managed to win a seat later , when a special election was held after resignations opened several seats . 
- Gore 's financial speculations in the late 1780s significantly multiplied his wealth . In 1788 he and Andrew Craigie , a Boston businessman who had retained Gore for legal services , entered into a secret agreement to purchase Continental securities with a face value of $ 100 @,@ 000 in a speculative bid that their value would rise . By late October of that year , the pair had met this goal : Gore had purchased $ 90 @,@ 000 worth of paper for about $ 20 @,@ 000 , and encouraged Craigie to purchase more than the $ 11 @,@ 000 he had acquired if his funding would allow for it . Gore also purchased Massachusetts war @-@ related debts , and lobbied Massachusetts Congressmen for the U.S. government to assume those as well . 
- Gore 's windfall was realized when in 1790 the United States Congress , acting on a proposal made by Alexander Hamilton and supported by Gore 's friend Rufus King , passed legislation that exchanged Continental and state paper for new U.S. paper at face value . Not only did Gore win on this exchange , but the paper he received appreciated in value before he sold it . The exact amount he made is unclear from the surviving documents : John Quincy Adams wrote that Gore 's speculations made him the wealthiest lawyer in the country . 
- The success of Gore 's speculations prompted him to enter a partnership with Craigie , William Duer and Daniel Parker in an attempt to acquire U.S. foreign debt obligations on favorable terms . Parker was a business partner of Craigie 's , and Duer was an influential New York businessman and Treasury Department official whose lavish lifestyle impressed Gore . The partnership promoted sales of U.S. lands in Europe , and sought to acquire U.S. obligations to France . Although Gore sank $ 10 @,@ 000 into this venture , it failed : more powerful and experienced Dutch bankers outmaneuvered the Americans . Gore also engaged in other ventures with these partners , but apparently carefully stayed with financial speculations , and avoided the partners ' less successful land ventures . 
- Much of Gore 's financial activity was mediated through the Bank of Massachusetts , where his father @-@ in @-@ law was a director . Gore himself was elected to its board in 1785 , when he also became a shareholder . During his time on the board the bank tightened its regulations on loan repayments , a move that improved the stability of its capital . Gore used the bank for most of his personal deposits , but also drew on lines of credit for as much as several thousand dollars . The bank shares he held paid relatively high dividends until 1791 , when the bank received serious competition from the First Bank of the United States . 
- The Bank of the United States was established by Alexander Hamilton to provide stable banking services on a national scale , and sought to open a branch in Boston . Hamilton recruited heavily in the Bank of Massachusetts , and Gore decided to make the move . He sold his shares in the Massachusetts bank , and became a director of the Boston branch of the U.S. Bank . He also purchased 200 shares in the new bank , a relatively large investment . Gore was influential in making hiring decisions for the branch , and sought to merge state @-@ chartered banks into the organization , arguing that only a nationally chartered bank could provide consistent and stable service . Gore resigned from the board in 1794 , citing the demands of his law practice . 
- Gore 's financial successes enabled him to join the elite society of Boston . In 1789 he purchased a large mansion on fashionable Bowdoin Square , and also bought a country estate in Waltham that grew over time to 300 acres ( 120 ha ) . He had a house built on the estate , most of which he operated as a gentleman farmer . He and other similarly @-@ situated Federalists formed the Massachusetts Society for Promoting Agriculture , of which he served as a trustee for several years ; the organization was not seen as significantly contributing to advances in agriculture . 
- 
- = = District attorney and diplomat = = 
- 
- In 1789 President George Washington appointed Gore the first United States Attorney for Massachusetts as a reward for his support . Gore controversially refused to resign from the state legislature , arguing that the state constitution 's prohibitions against holding multiple offices did not apply to federal posts . He eventually resigned the legislative seat under protest because of pressure from his fellow legislators . 
- Gore served as district attorney until 1796 . His principal matter of concern was the enforcement of U.S. neutrality with respect to the French Revolutionary Wars . He attempted several times to prosecute the French consul in Boston , Antoine Duplaine , for arming and operating privateers out of the Port of Boston , but he was stymied by local juries that sympathized with the French . Duplaine was eventually expelled on orders from President George Washington based on evidence provided by Gore . 
- Gore also promoted anti @-@ French sentiment with political writings in Massachusetts newspapers . Writing under the pseudonym " Manlius " , he denounced the formation of " Democratic Societies " formed to oppose Federalist policy and support pro @-@ French positions . He suggested to President Washington that someone be sent to England to negotiate with the British . John Jay traveled to London in 1794 and negotiated the Jay Treaty , whose ratification Gore vocally supported . Although Gore was hostile to French policy , he was on friendly terms with individual Frenchmen : he hosted the future French statesman Talleyrand when he visited the U.S. 
- In 1796 Washington appointed him as a commissioner representing the United States to handle maritime claims under the terms of the Jay Treaty . As a result the Gores moved to England that year , establishing a residence in the fashionable Hyde Park area . The commission was established to arbitrate claims emanating from British seizures of American vessels and cargoes , and from British claims relating to violations of American neutrality in the ongoing French Revolutionary Wars . It consisted of three Americans ( Gore , William Pinkney , and John Trumbull ) and two British commissioners ( John Nicoll and Nicholas Astley ) ; Trumbull was chosen by the other four because he was deemed to be sufficiently " fair @-@ minded " to cast deciding votes in the event of disagreements . That year he was also elected a Fellow of the American Academy of Arts and Sciences . 
- Although Gore was well received by the British establishment , the work suffered from what Gore called a " tediousness of process " , and he considered requesting a transfer in 1798 . In 1800 it ground to a halt because another board established by the treaty to resolve outstanding Revolutionary War claims against the United States had not yet met , and the British stopped the claims processing until resolution of the other issues got underway . Gore used this break to briefly return to America and assess the condition of his Waltham estate , where the house had been largely destroyed by fire in 1799 . After his return to London , with the commission work still stopped , he and Rebecca embarked on a tour of Europe . They visited Holland , Belgium , and Switzerland , and spent six months in Paris . During this trip , and later ones in England and Scotland , they took note of the architecture of country estates , and began planning a new house for their Waltham property . 
- The commission resumed its work in early 1802 , and had resolved all outstanding claims by August 1803 . It awarded $ 110 @,@ 000 to British claimants and over $ 6 million to American claimants . The lopsided result is due to the vastly larger number of American claims , but also to some key early decisions that favored American interpretations in the processing of the claims , and to a British administration that sought to remain in America 's good graces . 
- The Gores ' social circle in England revolved around Gore 's good friend Rufus King , who was appointed Ambassador to Great Britain in 1796 , along with other Massachusetts expatriates . When King left his post in May 1803 he named Gore to head the London embassy as chargé d 'affaires . Although President Thomas Jefferson never issued a formal appointment , the British government accepted his role for the two @-@ month interval between King 's departure and the arrival of James Monroe as King 's replacement . The Gores sailed for Boston in the spring of 1804 . 
- Rebecca Gore used their exposure to European country estates to design a lavish new building for their Waltham estate during their English sojourn . Designed with the assistance of French architect Joseph @-@ Guillaume Legrand and probably also influenced by the works of English architect Sir John Soane , the house that was built upon their return to the United States in 1804 ( now known as Gore Place ) is one of the finest extant examples of Federalist architecture . 
- 
- = = Lawyer and state legislator = = 
- 
- Soon after his return to the United States , Gore reentered state politics , winning election to the Massachusetts Senate . He was active in the state Federalist Party organization , sitting on its secret central committee . He resumed his law practice , in which he took on as a student Daniel Webster . One of the highest profile cases he took on was the 1807 defense of Thomas Selfridge , accused of murdering Charles Austin . Selfridge , an older Federalist attorney , had been retained to assist in the collection of a debt from Austin 's Republican father . In the politically charged atmosphere of the day in Boston , Selfridge , fearing for his own safety , had armed himself with a dueling pistol . The younger Austin had , apparently on his own initiative , sought to beat Selfridge with a cane , and Selfridge fatally shot him in the encounter . Selfridge was prosecuted by Attorney General ( and future Gore gubernatorial opponent ) James Sullivan , and the defense also included arch @-@ Federalist Harrison Gray Otis . Gore argued Selfridge acted in self @-@ defense ; Selfridge was acquitted of murder by a jury whose foreman was Patriot and Federalist Paul Revere after fifteen minutes ' deliberation . 
- Gore also resumed business activities upon his return . He invested in a wide variety of businesses and infrastructure , spurring economic activity in the state . His investments ranged widely , including maritime insurance ( where his father @-@ in @-@ law had made his fortune ) , bridges , locks , canals , and textiles . He was a major investor in the Middlesex Canal , the Craigie Bridge ( the first to connect Boston to Cambridge ) , and the Boston Manufacturing Company , whose factory proving the single @-@ site production of textiles was in Waltham near his estate . Not all of his ventures panned out : the canal was in the long run a financial failure , as were efforts with other collaborators to develop Lechmere Point , the Cambridge side of the Craigie Bridge . The textile mill , however , was a success , and Gore invested in the Merrimack Manufacturing Company . When it decided to locate in what is now Lowell , Massachusetts , Gore purchased shares in the Proprietors of Locks and Canals , which operated ( and still owns today ) the Lowell canals . 
- In 1806 Gore won election to the State Senate . That year the Republicans were in the majority , and the election for governor was close enough to require a recount . The legislature scrutinized the ballots in a partisan manner ( for example , retaining ballots containing misspelled versions of Republican James Sullivan 's name and discarding similar ballots marked for Federalist Caleb Strong ) . Gore and other Federalists raised a public outcry , and the legislature relented , eventually certifying Strong as the winner . 
- Gore ran unsuccessfully for Governor of Massachusetts in 1807 and 1808 against a rising tide of Republicanism in the state , losing both times to moderate Republican James Sullivan . The Federalists gained control of the state legislature in 1808 in a backlash against Republican economic policies , but Gore was criticized for his failure to aggressively support state protests against the Embargo Act of 1807 , which had a major negative effect on the state 's large merchant fleet . Gore was in 1808 elected to the Massachusetts House of Representatives , where he successfully led Federalist efforts to ensure the selection of a Federalist slate of presidential electors . He also spearheaded actions to drive Senator John Quincy Adams from the Federalist Party over his support of Thomas Jefferson 's foreign policy . The legislature elected Adams ' successor nine months early , and gave Adams sufficiently distasteful instructions that he resigned the post and joined with the Republicans . 
- 
- = = Governor = = 
- 
- Gore led the Federalists to victory in 1809 against Sullivan 's successor , Levi Lincoln , Sr. , who had taken over as acting governor upon Sullivan 's death late in 1808 . During Gore 's term the principal domestic issue occupying state politics was a banking crisis stimulated by the federal policy of embargoing trade with Great Britain and France , then embroiled in the Napoleonic Wars . Although the crisis caused a number of bank failures in New England , Massachusetts banks largely escaped unscathed . 
- Foreign policy played a major role in Gore 's administration . The legislature passed resolves opposing the federal government 's hardline policy against trade and diplomatic relations with the United Kingdom ( then embroiled in the Napoleonic Wars ) , and Gore in early 1810 invited Francis James Jackson , who had been rejected as the UK 's ambassador to the US , to visit the state . This pressure may have played a role in President James Madison 's decision to renew relations with the UK and accept Jackson 's credentials . 
- The lessening of the war threat , and the choice by the Republicans of the popular Elbridge Gerry as their candidate brought a challenge to Federalist control of Massachusetts in the 1810 elections . The unostentatious Gerry and Republican partisans criticized Gore for his lavish lifestyle , including his palatial Waltham residence and pompous activities he organized as governor , and highlighted his Loyalist family connections while emphasizing Gerry 's unimpeachable patriotism . Gerry won the election . Jackson did visit Boston , but he was greeted not by Gore , but Gerry . Gore ran against Gerry again in 1811 , but lost in another acrimonious campaign . 
- Gore was granted an honorary law degree from Harvard in 1809 . He served on the college 's Board of Overseers from 1810 to 1815 and as a Fellow from 1816 to 1820 . Harvard 's first library building , a Gothic structure built in 1838 of Quincy granite , was named in his honor , but was demolished when Widener Library was built in its place in 1915 . ( This structure is found on the seal of the city of Cambridge . ) One of the residential Winthrop House 's buildings is called Gore Hall in his honor . 
- 
- = = United States Senator = = 
- 
- In the spring of 1813 , he was appointed by Governor Caleb Strong to fill the U.S. Senate seat vacated by the resignation of Senator James Lloyd . He served from May 5 , 1813 to May 30 , 1816 , winning reelection to the seat in 1814 . He opposed the ongoing War of 1812 in these years , with his earlier diplomatic experience providing valuable knowledge to Federalist interests . He expressed approval of the 1814 Hartford Convention in which the New England states aired grievances concerning Republican governance of the country and the conduct of the war . 
- Gore assented to the Treaty of Ghent that ended the war , but was unhappy that the nation had not gained anything from the war . He resigned in June 1816 , unhappy with the politics of Washington and suffering from poor health . Although he was no longer active in politics , he continued to express opinions on the subjects of the day , opposing the 1820 Missouri Compromise and bemoaning the " great moderation & mediocrity " of Federalist Governor John Brooks . 
- 
- = = Later years and legacy = = 
- 
- Gore remained active in the administration of Harvard , and was active in a number of organizations , including the American Academy of Arts and Sciences and the Massachusetts Historical Society ( whose president he was from 1806 to 1818 ) . He was also elected a member of the American Antiquarian Society in 1814 . Gore spent most of his later years at his country estate in Waltham , suffering from worsening rheumatoid arthritis that made walking increasingly difficult . His declining health and lack of social scene in Waltham led him in 1822 to return to Boston in the winters . He died on March 1 , 1827 in Boston and is buried in its Granary Burying Ground . 
- Gore 's wife died in 1834 ; the couple had no children . The major beneficiary of the Gore estate was Harvard ( which received an estimated $ 100 @,@ 000 ) , although bequests were also made to the American Academy of Arts and Sciences and the Massachusetts Historical Society . The Waltham estate passed through several hands and was subdivided over time . The mansion was saved from demolition by the Gore Place Society ( established for the purpose of preserving it ) , which now operates it as a museum . It was declared a National Historic Landmark in 1970 . 
- 
- 
- = Nero = 
- 
- Nero ( / ˈnɪəroʊ / ; Latin : Nerō Claudius Caesar Augustus Germanicus ; 15 December 37 AD – 9 June 68 AD ) was Roman Emperor from 54 to 68 , and the last in the Julio @-@ Claudian dynasty . Nero was adopted by his grand @-@ uncle Claudius to become his heir and successor , and succeeded to the throne in 54 following Claudius ' death . 
- Nero focused much of his attention on diplomacy , trade and enhancing the cultural life of the empire , but according to the historian Tacitus he was viewed by the Roman people as compulsive and corrupt . He ordered theatres built and promoted athletic games . During his reign , the redoubtable general Corbulo conducted a successful war and negotiated peace with the Parthian Empire . His general Suetonius Paulinus crushed a revolt in Britain . Nero annexed the Bosporan Kingdom to the empire and began the First Jewish – Roman War . 
- In 64 AD , most of Rome was destroyed in the Great Fire of Rome , which many Romans believed Nero himself had started in order to clear land for his planned palatial complex , the Domus Aurea . In 68 , the rebellion of Vindex in Gaul and later the acclamation of Galba in Hispania drove Nero from the throne . Facing a false report of being denounced as a public enemy who was to be executed , he committed suicide on 9 June 68 ( the first Roman emperor to do so ) . His death ended the Julio @-@ Claudian dynasty , sparking a brief period of civil wars known as the Year of the Four Emperors . Nero 's rule is often associated with tyranny and extravagance . He is known for many executions , including that of his mother , and the probable murder by poison of his stepbrother Britannicus . 
- Nero was rumored to have had captured Christians dipped in oil and set on fire in his garden at night as a source of light . This view is based on the writings of Tacitus , Suetonius and Cassius Dio , the main surviving sources for Nero 's reign , but a few surviving sources paint Nero in a more favourable light . Some sources , including some mentioned above , portray him as an emperor who was popular with the common Roman people , especially in the East . Some modern historians question the reliability of ancient sources when reporting on Nero 's tyrannical acts . 
- 
- = = Early life = = 
- 
- 
- = = = Family = = = 
- 
- Lucius Domitius Ahenobarbus , Nero , was born on 15 December 37 in Antium ( modern Anzio and Nettuno ) , near Rome . He was the only son of Gnaeus Domitius Ahenobarbus and Agrippina the Younger , sister of Emperor Caligula . 
- Nero 's father , Gnaeus , was the son of Lucius Domitius Ahenobarbus ( consul 16 BC ) and Antonia Major . Gnaeus was thus the grandson of Gnaeus Domitius Ahenobarbus ( consul 32 BC ) and probably Aemilia Lepida on his father 's side , and the grandson of Mark Antony and Octavia Minor on his mother 's side . Thus , Nero had as his paternal grandmother Antonia Major , and also claimed more remote descent from Antonia Minor as a great @-@ grandson — later grandson after Claudius adopted him . 
- Through Octavia , Nero was the great @-@ nephew of Caesar Augustus . Nero 's father had been employed as a praetor and was a member of Caligula 's staff when the latter travelled to the East ( some apparently think Suetonius refers to Augustus 's adopted son Gaius Caesar here , but this is not likely ) . 
- Nero 's father was described by Suetonius as a murderer and a cheat who was charged by Emperor Tiberius with treason , adultery and incest . Tiberius died , allowing him to escape these charges . Nero 's father died of edema ( " dropsy " ) in 39 when Nero was two . 
- Nero 's mother was Agrippina the Younger , a great @-@ granddaughter of Caesar Augustus and his wife Scribonia through their daughter Julia the Elder and her husband Marcus Vipsanius Agrippa . Agrippina 's father , Germanicus , was a grandson of Augustus 's wife , Livia , on one side and of Mark Antony and Octavia on the other . Germanicus ' mother , Antonia Minor , was a daughter of Octavia Minor and Mark Antony . Octavia was Augustus ' elder sister . Germanicus was also the adopted son of Tiberius . Agrippina poisoned her second husband Passienus Crispus , so many ancient historians also accuse her of murdering her third husband , the emperor Claudius . 
- 
- = = = Ancestry and family = = = 
- 
- 
- = = = Rise to power = = = 
- 
- Nero was not expected to become Emperor because his maternal uncle , Caligula , had begun his reign at the age of 24 with enough time to produce his own heir . Nero 's mother , Agrippina , lost favour with Caligula and was exiled in 39 after her husband 's death . Caligula seized Nero 's inheritance and sent him to be brought up by his less wealthy aunt , Domitia Lepida , who was the mother of Valeria Messalina , Claudius 's third wife . Caligula , his wife Caesonia and their infant daughter Julia Drusilla were murdered on 24 January 41 . These events led Claudius , Caligula 's uncle , to become emperor . Claudius allowed Agrippina to return from exile . 
- Claudius had married twice before marrying Valeria Messalina . His previous marriages produced three children including a son , Drusus , who died at a young age . He had two children with Messalina – Claudia Octavia ( born 40 ) and Britannicus ( born 41 ) . Messalina was executed by Claudius in the year 48 . 
- In 49 AD , Claudius married a fourth time , to Nero 's mother Agrippina , despite her being his niece . To aid Claudius politically , young Nero was adopted in 50 and took the name Nero Claudius Caesar Drusus Germanicus ( see adoption in Rome ) . Nero was older than his stepbrother Britannicus , and thus became heir to the throne . 
- Nero was proclaimed an adult in 51 at the age of 14 . He was appointed proconsul , entered and first addressed the Senate , made joint public appearances with Claudius , and was featured in coinage . In 53 , he married his stepsister Claudia Octavia . 
- 
- = = Emperor ( 54 – 68 AD ) = = 
- 
- 
- = = = Early rule = = = 
- 
- Claudius died in 54 and Nero , taking the name Nero Claudius Caesar Augustus Germanicus , was established as Emperor . Though accounts vary , many ancient historians state Agrippina poisoned Claudius . According to Pliny the Elder , she used poison mushrooms . It is not known how much Nero knew or if he was even involved in the death of Claudius . 
- Suetonius wrote " ... for even if he was not the instigator of the emperor 's death , he was at least privy to it , as he openly admitted ; for he used afterwards to laud mushrooms , the vehicle in which the poison was administered to Claudius , as " the food of the gods , " as the Greek proverb has it . At any rate , after Claudius ' death he vented on him every kind of insult , in act and word , charging him now with folly and now with cruelty ; for it was a favourite joke of his to say that Claudius had ceased " to play the fool " among mortals , lengthening the first syllable of the word morari , and he disregarded many of his decrees and acts as the work of a madman and a dotard . Finally , he neglected to enclose the place where his body was burned except with a low and mean wall . " 
- According to Suetonius , Nero became Emperor at the age of 17 when the news of Claudius ' death was made known , making him the youngest emperor at that time . However , what Suetonius may have meant is that he was in his seventeenth year , as his date of birth , also listed by Suetonius , would have made him 16 at the time of Claudius ' death . Tacitus , in book XIII of his Annals , describes Nero as being ' scarcely out of his boyhood ' at the time he became emperor . Ancient historians describe Nero 's early reign as being strongly influenced by his mother , Agrippina , his tutor Lucius Annaeus Seneca , and the Praetorian Prefect Sextus Afranius Burrus , especially in the first year . Other tutors were less often mentioned , such as Alexander of Aegae . 
- Very early in Nero 's rule , problems arose from competition for influence between Agrippina and Nero 's two main advisers , Seneca and Burrus . Agrippina also attempted to influence the young Nero . Agrippina also is mentioned by ancient sources as " scheming for her son ( Nero ) " . This scheming continued , as is evidenced by the coin depicting both of them . It was extremely unusual to see a woman 's face on a coin in the ancient world . Because of this position of power , Agrippina felt jealous as Seneca in particular rose up in Nero 's court , since he , unlike his mother , offered the advice Nero wanted to hear . 
- In 54 , Agrippina tried to sit down next to Nero while he met with an Armenian envoy , but Seneca stopped her and prevented a scandalous scene ( as it was unimaginable at that time for a woman to be in the same room as men doing official business ) . Nero 's friends also mistrusted Agrippina and told Nero to beware of his mother . 
- Nero was reportedly unsatisfied with his marriage to Octavia and entered into an affair with Claudia Acte , a former slave . In 55 , Agrippina attempted to intervene in favor of Octavia and demanded that her son dismiss Acte . Nero , with the support of Seneca , resisted the intervention of his mother in his personal affairs . 
- With Agrippina 's influence over her son severed , she reportedly began pushing for Britannicus , Nero 's stepbrother , to become emperor . Nearly fourteen @-@ year @-@ old Britannicus , heir @-@ designate prior to Nero 's adoption , was still legally a minor , but was approaching legal adulthood . According to Tacitus , Agrippina hoped that with her support , Britannicus , being the blood son of Claudius , would be seen as the true heir to the throne by the state over Nero . However , the youth died suddenly and suspiciously on 12 February 55 , the very day before his proclamation as an adult had been set . 
- Nero claimed that Britannicus died from an epileptic seizure , but ancient historians all claim Britannicus ' death came from Nero 's poisoning him . Supposedly , he enlisted the services of Locusta , a woman who specialized in the manufacture of poisons . She devised a mixture to kill Britannicus , but after testing it unsuccessfully on a slave , Nero angrily threatened to have her put to death if she did not come up with something usable . Locusta then devised a new concoction that she promised would " kill swifter than a viper . " 
- Her promise was fulfilled after Britannicus consumed it at a dinner party from water used to cool his wine , which had already been tasted , and succumbed within minutes . After the death of Britannicus , Agrippina was accused of slandering Octavia and Nero ordered her out of the imperial residence . 
- 
- = = = Matricide and consolidation of power = = = 
- 
- Over time , Nero became progressively more powerful , freeing himself of his advisers and eliminating rivals to the throne . In 55 , he removed Marcus Antonius Pallas , an ally of Agrippina , from his position in the treasury . Pallas , along with Burrus , was accused of conspiring against the Emperor to bring Faustus Sulla to the throne . Seneca was accused of having relations with Agrippina and embezzlement . Seneca succeeded in having himself , Pallas and Burrus acquitted . According to Cassius Dio , at this time , Seneca and Burrus reduced their role in governing from careful management to mere moderation of Nero . 
- In 58 , Nero became romantically involved with Poppaea Sabina , the wife of his friend and future emperor Otho . Reportedly because a marriage to Poppaea and a divorce from Octavia did not seem politically feasible with Agrippina alive , Nero ordered the murder of his mother in 59 . A number of modern historians find this an unlikely motive as Nero did not marry Poppaea until 62 . 
- Additionally , according to Suetonius , Poppaea did not divorce her husband until after Agrippina 's death , making it unlikely that the already married Poppaea would be pressing Nero for marriage . Some modern historians theorize that Nero 's execution of Agrippina was prompted by her plotting to set Rubellius Plautus on the throne . According to Suetonius , Nero tried to kill his mother through a shipwreck planned by his freedman tutor Anicetus . Instead , it took the life of Agrippina 's friend , Acerronia Polla . When Agrippina survived , he had her executed by Anicetus and framed it as a suicide . The incident is also recorded by Tacitus . 
- In 62 , Nero 's adviser , Burrus , died . Additionally , Seneca was again faced with embezzlement charges . Seneca asked Nero for permission to retire from public affairs . Nero divorced and banished Octavia on grounds of infertility , leaving him free to marry the pregnant Poppaea . After public protests , Nero was forced to allow Octavia to return from exile , but she was executed shortly after her return . 
- Nero also was reported to have kicked Poppaea to death in 65 before she could have his second child . However , modern historians , noting Suetonius , Tacitus and Cassius Dio 's possible bias against Nero and the likelihood that they did not have eyewitness accounts of private events , postulate that Poppaea may have died because of complications of miscarriage or childbirth . 
- Accusations of treason being plotted against Nero and the Senate first appeared in 62 . The Senate ruled that Antistius , a praetor , should be put to death for speaking ill of Nero at a party . Later , Nero ordered the exile of Fabricius Veiento who slandered the Senate in a book . Tacitus writes that the roots of the conspiracy led by Gaius Calpurnius Piso began in this year . To consolidate power , Nero executed a number of people in 62 and 63 including his rivals Pallas , Rubellius Plautus and Faustus Sulla . According to Suetonius , Nero " showed neither discrimination nor moderation in putting to death whomsoever he pleased " during this period . 
- Nero 's consolidation of power also included a slow usurping of authority from the Senate . In 54 , Nero promised to give the Senate powers equivalent to those under Republican rule . By 65 , senators complained that they had no power left and this led to the Pisonian conspiracy . 
- 
- = = = Other relationships = = = 
- 
- When Nero 's wife Poppaea Sabina died in 65 , Nero went into deep mourning . Her body was not cremated ; it was stuffed with spices , embalmed and put in the Mausoleum of Augustus . She was given a state funeral . Nero praised her during the funeral eulogy and gave her divine honors . It is said that Nero burned ten years ' worth of Arabia 's incense production at her funeral . 
- In the beginning of 66 , he married Statilia Messalina . She was already married when she became Nero 's mistress in 65 AD , with Statilia 's husband being driven to suicide in 66 , so Nero could marry Statilia . She was one of the few of Nero 's courtiers who survived the fall of his reign . 
- In 67 , Nero ordered a young freedman , Sporus , to be castrated and then married him . According to Cassius Dio , Sporus bore an uncanny resemblance to Sabina , and Nero even called him by his dead wife 's name . 
- 
- = = = Administrative policies = = = 
- 
- Over the course of his reign , Nero often made rulings that pleased the lower class . Nero was criticized as being obsessed with personal popularity . 
- Nero began his reign in 54 by promising the Senate more autonomy . In this first year , he forbade others to refer to him with regard to enactments , for which he was praised by the Senate . Nero was known for spending his time visiting brothels and taverns during this period . 
- In 55 , Nero began taking on a more active role as an administrator . He was consul four times between 55 and 60 . During this period , some ancient historians speak fairly well of Nero and contrast it with his later rule . 
- Under Nero , restrictions were put on the amount of bail and fines . Also , fees for lawyers were limited . There was a discussion in the Senate on the misconduct of the freedmen class , and a strong demand was made that patrons should have the right of revoking freedom . Nero supported the freedmen and ruled that patrons had no such right . 
- The Senate tried to pass a law in which the crimes of one slave applied to all slaves within a household . Despite riots from the people , Nero supported the Senate on their measure , and deployed troops to organise the execution of 400 slaves affected by the law . However , he vetoed strong measures against the freedmen affected by the case . 
- After tax collectors were accused of being too harsh to the poor , Nero transferred collection authority to lower commissioners . Nero banned any magistrate or procurator from exhibiting public entertainment for fear that the venue was being used as a method to sway the populace . Additionally , there were many impeachments and removals of government officials along with arrests for extortion and corruption . 
- When further complaints arose that the poor were being overly taxed , Nero attempted to repeal all indirect taxes . The Senate convinced him this action would bankrupt the public treasury . As a compromise , taxes were cut from 4 @.@ 5 % to 2 @.@ 5 % . Additionally , secret government tax records were ordered to become public . To lower the cost of food imports , merchant ships were declared tax @-@ exempt . 
- In imitation of the Greeks , Nero built a number of gymnasiums and theatres . Enormous gladiatorial shows were also held . Nero also established the quinquennial Neronia . The festival included games , poetry , and theater . Historians indicate that there was a belief that theatre led to immorality . Others considered that to have performers dressed in Greek clothing was old fashioned . Some questioned the large public expenditure on entertainment . 
- In 64 , Rome burned . Nero enacted a public relief effort as well as significant reconstruction . A number of other major construction projects occurred in Nero 's late reign . Nero had the marshes of Ostia filled with rubble from the fire . He erected the large Domus Aurea . In 67 , Nero attempted to have a canal dug at the Isthmus of Corinth . Ancient historians state that these projects and others exacerbated the drain on the State 's budget . 
- The cost to rebuild Rome was immense , requiring funds the state treasury did not have . Nero devalued the Roman currency for the first time in the Empire 's history . He reduced the weight of the denarius from 84 per Roman pound to 96 ( 3 @.@ 85 grams to 3 @.@ 35 grams ) . He also reduced the silver purity from 99 @.@ 5 % to 93 @.@ 5 % — the silver weight dropping from 3 @.@ 83 grams to 3 @.@ 4 grams . Furthermore , Nero reduced the weight of the aureus from 40 per Roman pound to 45 ( 8 grams to 7 @.@ 2 grams ) . 
- Between 62 and 67 , according to Pliny the Elder and Seneca , Nero promoted an expedition to discover the sources of the Nile River . It was the first exploration of equatorial Africa from Europe in history . However , Nero 's expedition up the Nile failed upon reaching the impenetrable Sudd of present @-@ day South Sudan . 
- The economic policy of Nero is a point of debate among scholars . According to ancient historians , Nero 's construction projects were overly extravagant and the large number of expenditures under Nero left Italy " thoroughly exhausted by contributions of money " with " the provinces ruined . " Modern historians , though , note that the period was riddled with deflation and that it is likely that Nero 's spending came in the form of public works projects and charity intended to ease economic troubles . 
- 
- = = = Great Fire of Rome ( 64 AD ) = = = 
- 
- The Great Fire of Rome erupted on the night of 18 July to 19 July 64 . The fire started at the southeastern end of the Circus Maximus in shops selling flammable goods . 
- The extent of the fire is uncertain . According to Tacitus , who was nine at the time of the fire , it spread quickly and burned for over five days . It destroyed three of fourteen Roman districts and severely damaged seven . The only other historian who lived through the period and mentioned the fire is Pliny the Elder , who wrote about it in passing . Other historians who lived through the period ( including Josephus , Dio Chrysostom , Plutarch and Epictetus ) make no mention of it in what remains of their work . 
- It is uncertain who or what actually caused the fire — whether accident or arson . Suetonius and Cassius Dio favor Nero as the arsonist , so he could build a palatial complex . Tacitus mentions that Christians confessed to the crime , but it is not known whether these confessions were induced by torture . However , accidental fires were common in ancient Rome . In fact , Rome suffered other large fires in 69 and in 80 . 
- It was said by Suetonius and Cassius Dio that Nero sang the " Sack of Ilium " in stage costume while the city burned . Popular legend claims that Nero played the fiddle at the time of the fire , an anachronism based merely on the concept of the lyre , a stringed instrument associated with Nero and his performances . ( The fiddle was not invented until the 10th century . ) Tacitus 's account , however , has Nero in Antium at the time of the fire . Tacitus also said that Nero playing his lyre and singing while the city burned was only rumor . 
- According to Tacitus , upon hearing news of the fire , Nero returned to Rome to organize a relief effort , which he paid for from his own funds . Nero 's contributions to the relief extended to personally taking part in the search for and rescue of victims of the blaze , spending days searching the debris without even his bodyguards . After the fire , Nero opened his palaces to provide shelter for the homeless , and arranged for food supplies to be delivered in order to prevent starvation among the survivors . 
- In the wake of the fire , he made a new urban development plan . Houses after the fire were spaced out , built in brick , and faced by porticos on wide roads . Nero also built a new palace complex known as the Domus Aurea in an area cleared by the fire . This included lush artificial landscapes and a 30 @-@ meter @-@ tall statue of himself , the Colossus of Nero . The size of this complex is debated ( from 100 to 300 acres ) . To find the necessary funds for the reconstruction , tributes were imposed on the provinces of the empire . 
- Tacitus , in one of the earliest non @-@ Christian references to the origins of Christianity , notes that the population searched for a scapegoat and rumors held Nero responsible . To deflect blame , Nero targeted Christians . He ordered Christians to be thrown to dogs , while others were crucified and burned . 
- 
- = = = Public performances = = = 
- 
- Nero enjoyed driving a one @-@ horse chariot , singing to the lyre and poetry . He even composed songs that were performed by other entertainers throughout the empire . At first , Nero only performed for a private audience . 
- In 64 AD , Nero began singing in public in Neapolis in order to improve his popularity . He also sang at the second quinquennial Neronia in 65 . It was said that Nero craved the attention , but historians also write that Nero was encouraged to sing and perform in public by the Senate , his inner circle and the people . Ancient historians strongly criticize his choice to perform , calling it shameful . 
- Nero was persuaded to participate in the Olympic Games of 67 in order to improve relations with Greece and display Roman dominance . As a competitor , Nero raced a ten @-@ horse chariot and nearly died after being thrown from it . He also performed as an actor and a singer . Though Nero faltered in his racing ( in one case , dropping out entirely before the end ) and acting competitions , he won these crowns nevertheless and paraded them when he returned to Rome . The victories are attributed to Nero bribing the judges and his status as emperor . 
- 
- = = = War and peace with Parthia = = = 
- 
- Shortly after Nero 's accession to the throne in 54 , the Roman vassal kingdom of Armenia overthrew their Iberian prince Rhadamistus and he was replaced with the Parthian prince Tiridates . This was seen as a Parthian invasion of Roman territory . There was concern in Rome over how the young Emperor would handle the situation . Nero reacted by immediately sending the military to the region under the command of Gnaeus Domitius Corbulo . The Parthians temporarily relinquished control of Armenia to Rome . 
- The peace did not last and full @-@ scale war broke out in 58 . The Parthian king Vologases I refused to remove his brother Tiridates from Armenia . The Parthians began a full @-@ scale invasion of the Armenian kingdom . Commander Corbulo responded and repelled most of the Parthian army that same year . Tiridates retreated and Rome again controlled most of Armenia . 
- Nero was acclaimed in public for this initial victory . Tigranes , a Cappadocian noble raised in Rome , was installed by Nero as the new ruler of Armenia . Corbulo was appointed governor of Syria as a reward . 
- In 62 , Tigranes invaded the Parthian province of Adiabene . Again , Rome and Parthia were at war and this continued until 63 . Parthia began building up for a strike against the Roman province of Syria . Corbulo tried to convince Nero to continue the war , but Nero opted for a peace deal instead . There was anxiety in Rome about eastern grain supplies and a budget deficit . 
- The result was a deal where Tiridates again became the Armenian king , but was crowned in Rome by Emperor Nero . In the future , the king of Armenia was to be a Parthian prince , but his appointment required approval from the Romans . Tiridates was forced to come to Rome and partake in ceremonies meant to display Roman dominance . 
- This peace deal of 63 was a considerable victory for Nero politically . Nero became very popular in the eastern provinces of Rome and with the Parthians as well . The peace between Parthia and Rome lasted 50 years until Emperor Trajan of Rome invaded Armenia in 114 . 
- 
- = = = Other major power struggles and rebellions = = = 
- 
- The war with Parthia was not Nero 's only major war but he was both criticized and praised for an aversion to battle . Like many emperors , Nero faced a number of rebellions and power struggles within the empire . 
- British Revolt of 60 – 61 ( Boudica 's Uprising ) 
- In 60 , a major rebellion broke out in the province of Britannia . While the governor Gaius Suetonius Paulinus and his troops were busy capturing the island of Mona ( Anglesey ) from the druids , the tribes of the southeast staged a revolt led by queen Boudica of the Iceni . Boudica and her troops destroyed three cities before the army of Paulinus could return , receive reinforcements , and quell the rebellion in 61 . Fearing Paulinus himself would provoke further rebellion , Nero replaced him with the more passive Publius Petronius Turpilianus . 
- The Pisonian Conspiracy of 65 
- In 65 , Gaius Calpurnius Piso , a Roman statesman , organized a conspiracy against Nero with the help of Subrius Flavus and Sulpicius Asper , a tribune and a centurion of the Praetorian Guard . According to Tacitus , many conspirators wished to " rescue the state " from the emperor and restore the Republic . The freedman Milichus discovered the conspiracy and reported it to Nero 's secretary , Epaphroditos . As a result , the conspiracy failed and its members were executed , including Lucan , the poet . Nero 's previous advisor , Seneca , was ordered to commit suicide after admitting he discussed the plot with the conspirators . 
- The First Jewish War of 66 – 70 
- In 66 , there was a Jewish revolt in Judea stemming from Greek and Jewish religious tension . In 67 , Nero dispatched Vespasian to restore order . This revolt was eventually put down in 70 , after Nero 's death . This revolt is famous for Romans breaching the walls of Jerusalem and destroying the Second Temple of Jerusalem . 
- 
- = = = The revolt of Vindex and Galba and the death of Nero = = = 
- 
- In March 68 , Gaius Julius Vindex , the governor of Gallia Lugdunensis , rebelled against Nero 's tax policies . Lucius Verginius Rufus , the governor of Germania Superior , was ordered to put down Vindex 's rebellion . In an attempt to gain support from outside his own province , Vindex called upon Servius Sulpicius Galba , the governor of Hispania Tarraconensis , to join the rebellion and further , to declare himself emperor in opposition to Nero . 
- At the Battle of Vesontio in May 68 , Verginius ' forces easily defeated those of Vindex and the latter committed suicide . However , after putting down this one rebel , Verginius ' legions attempted to proclaim their own commander as Emperor . Verginius refused to act against Nero , but the discontent of the legions of Germany and the continued opposition of Galba in Spain did not bode well for him . 
- While Nero had retained some control of the situation , support for Galba increased despite his being officially declared a public enemy . The prefect of the Praetorian Guard , Gaius Nymphidius Sabinus , also abandoned his allegiance to the Emperor and came out in support for Galba . 
- In response , Nero fled Rome with the intention of going to the port of Ostia and , from there , taking a fleet to one of the still @-@ loyal eastern provinces . According to Suetonius , Nero abandoned the idea when some army officers openly refused to obey his commands , responding with a line from Vergil 's Aeneid : " Is it so dreadful a thing then to die ? " Nero then toyed with the idea of fleeing to Parthia , throwing himself upon the mercy of Galba , or appealing to the people and begging them to pardon him for his past offences " and if he could not soften their hearts , to entreat them at least to allow him the prefecture of Egypt " . Suetonius reports that the text of this speech was later found in Nero 's writing desk , but that he dared not give it from fear of being torn to pieces before he could reach the Forum . 
- Nero returned to Rome and spent the evening in the palace . After sleeping , he awoke at about midnight to find the palace guard had left . Dispatching messages to his friends ' palace chambers for them to come , he received no answers . Upon going to their chambers personally , he found them all abandoned . When he called for a gladiator or anyone else adept with a sword to kill him , no one appeared . He cried , " Have I neither friend nor foe ? " and ran out as if to throw himself into the Tiber . 
- Returning , Nero sought for some place where he could hide and collect his thoughts . An imperial freedman , Phaon , offered his villa , located 4 miles outside the city . Travelling in disguise , Nero and four loyal freedmen , Epaphroditos , Phaon , Neophytus , and Sporus , reached the villa , where Nero ordered them to dig a grave for him . 
- At this time , a courier arrived with a report that the Senate had declared Nero a public enemy and that it was their intention to execute him by beating him to death and that armed men had been sent to apprehend him for the act to take place in the Forum . The Senate actually was still reluctant and deliberating on the right course of action as Nero was the last member of the Julio @-@ Claudian Family . Indeed , most of the senators had served the imperial family all their lives and felt a sense of loyalty to the deified bloodline , if not to Nero himself . The men actually had the goal of returning Nero back to the Senate , where the Senate hoped to work out a compromise with the rebelling governors that would preserve Nero 's life , so that at least a future heir to the dynasty could be produced . 
- Nero , however , did not know this , and at the news brought by the courier , he prepared himself for suicide , pacing up and down muttering " Qualis artifex pereo " which translates to English as " What an artist dies in me . " Losing his nerve , he first begged for one of his companions to set an example by first killing himself . At last , the sound of approaching horsemen drove Nero to face the end . However , he still could not bring himself to take his own life but instead he forced his private secretary , Epaphroditos , to perform the task . 
- When one of the horsemen entered , upon his seeing Nero all but dead he attempted to stop the bleeding in vain . Nero 's final words were " Too late ! This is fidelity ! " He died on 9 June 68 , the anniversary of the death of Octavia , and was buried in the Mausoleum of the Domitii Ahenobarbi , in what is now the Villa Borghese ( Pincian Hill ) area of Rome . 
- With his death , the Julio @-@ Claudian dynasty ended . The Senate , when news of his death reached Rome , posthumously declared Nero a public enemy to appease the coming Galba ( as the Senate had initially declared Galba as a public enemy ) and proclaimed Galba the new emperor . Chaos would ensue in the year of the Four Emperors . 
- 
- = = = Post mortem = = = 
- 
- According to Suetonius and Cassius Dio , the people of Rome celebrated the death of Nero . Tacitus , though , describes a more complicated political environment . Tacitus mentions that Nero 's death was welcomed by Senators , nobility and the upper class . The lower @-@ class , slaves , frequenters of the arena and the theater , and " those who were supported by the famous excesses of Nero " , on the other hand , were upset with the news . Members of the military were said to have mixed feelings , as they had allegiance to Nero , but were bribed to overthrow him . 
- Eastern sources , namely Philostratus II and Apollonius of Tyana , mention that Nero 's death was mourned as he " restored the liberties of Hellas with a wisdom and moderation quite alien to his character " and that he " held our liberties in his hand and respected them . " 
- Modern scholarship generally holds that , while the Senate and more well @-@ off individuals welcomed Nero 's death , the general populace was " loyal to the end and beyond , for Otho and Vitellius both thought it worthwhile to appeal to their nostalgia . " 
- Nero 's name was erased from some monuments , in what Edward Champlin regards as an " outburst of private zeal " . Many portraits of Nero were reworked to represent other figures ; according to Eric R. Varner , over fifty such images survive . This reworking of images is often explained as part of the way in which the memory of disgraced emperors was condemned posthumously ( see damnatio memoriae ) . Champlin , however , doubts that the practice is necessarily negative and notes that some continued to create images of Nero long after his death . 
- The civil war during the year of the Four Emperors was described by ancient historians as a troubling period . According to Tacitus , this instability was rooted in the fact that emperors could no longer rely on the perceived legitimacy of the imperial bloodline , as Nero and those before him could . Galba began his short reign with the execution of many allies of Nero and possible future enemies . One such notable enemy included Nymphidius Sabinus , who claimed to be the son of Emperor Caligula . 
- Otho overthrew Galba . Otho was said to be liked by many soldiers because he had been a friend of Nero 's and resembled him somewhat in temperament . It was said that the common Roman hailed Otho as Nero himself . Otho used " Nero " as a surname and reerected many statues to Nero . Vitellius overthrew Otho . Vitellius began his reign with a large funeral for Nero complete with songs written by Nero . 
- After Nero 's suicide in 68 , there was a widespread belief , especially in the eastern provinces , that he was not dead and somehow would return . This belief came to be known as the Nero Redivivus Legend . 
- The legend of Nero 's return lasted for hundreds of years after Nero 's death . Augustine of Hippo wrote of the legend as a popular belief in 422 . 
- At least three Nero imposters emerged leading rebellions . The first , who sang and played the cithara or lyre and whose face was similar to that of the dead emperor , appeared in 69 during the reign of Vitellius . After persuading some to recognize him , he was captured and executed . Sometime during the reign of Titus ( 79 – 81 ) , another impostor appeared in Asia and sang to the accompaniment of the lyre and looked like Nero but he , too , was killed . Twenty years after Nero 's death , during the reign of Domitian , there was a third pretender . He was supported by the Parthians , who only reluctantly gave him up , and the matter almost came to war . 
- 
- = = Physical appearance = = 
- 
- In his book The Lives of the Twelve Caesars , Suetonius describes Nero as " about the average height , his body marked with spots and malodorous , his hair light blonde , his features regular rather than attractive , his eyes blue and somewhat weak , his neck over thick , his belly prominent , and his legs very slender . " 
- 
- = = Historiography = = 
- 
- The history of Nero 's reign is problematic in that no historical sources survived that were contemporary with Nero . These first histories at one time did exist and were described as biased and fantastical , either overly critical or praising of Nero . The original sources were also said to contradict on a number of events . Nonetheless , these lost primary sources were the basis of surviving secondary and tertiary histories on Nero written by the next generations of historians . A few of the contemporary historians are known by name . Fabius Rusticus , Cluvius Rufus and Pliny the Elder all wrote condemning histories on Nero that are now lost . There were also pro @-@ Nero histories , but it is unknown who wrote them or for what deeds Nero was praised . 
- The bulk of what is known of Nero comes from Tacitus , Suetonius and Cassius Dio , who were all of the senatorial class . Tacitus and Suetonius wrote their histories on Nero over fifty years after his death , while Cassius Dio wrote his history over 150 years after Nero 's death . These sources contradict on a number of events in Nero 's life including the death of Claudius , the death of Agrippina , and the Roman fire of 64 , but they are consistent in their condemnation of Nero . 
- A handful of other sources also add a limited and varying perspective on Nero . Few surviving sources paint Nero in a favourable light . Some sources , though , portray him as a competent emperor who was popular with the Roman people , especially in the east . 
- Cassius Dio 
- Cassius Dio ( c . 155 – 229 ) was the son of Cassius Apronianus , a Roman senator . He passed the greater part of his life in public service . He was a senator under Commodus and governor of Smyrna after the death of Septimius Severus ; and afterwards suffect consul around 205 , and also proconsul in Africa and Pannonia . 
- Books 61 – 63 of Dio 's Roman History describe the reign of Nero . Only fragments of these books remain and what does remain was abridged and altered by John Xiphilinus , an 11th @-@ century monk . 
- Dio Chrysostom 
- Dio Chrysostom ( c . 40 – 120 ) , a Greek philosopher and historian , wrote the Roman people were very happy with Nero and would have allowed him to rule indefinitely . They longed for his rule once he was gone and embraced imposters when they appeared : 
- Indeed the truth about this has not come out even yet ; for so far as the rest of his subjects were concerned , there was nothing to prevent his continuing to be Emperor for all time , seeing that even now everybody wishes he were still alive . And the great majority do believe that he still is , although in a certain sense he has died not once but often along with those who had been firmly convinced that he was still alive . 
- Epictetus 
- Epictetus ( c . 55 – 135 ) was the slave to Nero 's scribe Epaphroditos . He makes a few passing negative comments on Nero 's character in his work , but makes no remarks on the nature of his rule . He describes Nero as a spoiled , angry and unhappy man . 
- Josephus 
- The historian Josephus ( c . 37 – 100 ) , while calling Nero a tyrant , was also the first to mention bias against Nero . Of other historians , he said : 
- But I omit any further discourse about these affairs ; for there have been a great many who have composed the history of Nero ; some of which have departed from the truth of facts out of favour , as having received benefits from him ; while others , out of hatred to him , and the great ill @-@ will which they bore him , have so impudently raved against him with their lies , that they justly deserve to be condemned . Nor do I wonder at such as have told lies of Nero , since they have not in their writings preserved the truth of history as to those facts that were earlier than his time , even when the actors could have no way incurred their hatred , since those writers lived a long time after them . 
- Lucan 
- Though more of a poet than historian , Lucanus ( c . 39 – 65 ) has one of the kindest accounts of Nero 's rule . He writes of peace and prosperity under Nero in contrast to previous war and strife . Ironically , he was later involved in a conspiracy to overthrow Nero and was executed . 
- Philostratus 
- Philostratus II " the Athenian " ( c . 172 – 250 ) spoke of Nero in the Life of Apollonius Tyana ( Books 4 – 5 ) . Though he has a generally bad or dim view of Nero , he speaks of others ' positive reception of Nero in the East . 
- Pliny the Elder 
- The history of Nero by Pliny the Elder ( c . 24 – 79 ) did not survive . Still , there are several references to Nero in Pliny 's Natural Histories . Pliny has one of the worst opinions of Nero and calls him an " enemy of mankind . " 
- Plutarch 
- Plutarch ( c . 46 – 127 ) mentions Nero indirectly in his account of the Life of Galba and the Life of Otho . Nero is portrayed as a tyrant , but those that replace him are not described as better . 
- Seneca the Younger 
- It is not surprising that Seneca ( c . 4 BC – 65 ) , Nero 's teacher and advisor , writes very well of Nero . 
- Suetonius 
- Suetonius ( c . 69 – 130 ) was a member of the equestrian order , and he was the head of the department of the imperial correspondence . While in this position , Suetonius started writing biographies of the emperors , accentuating the anecdotal and sensational aspects . 
- Tacitus 
- The Annals by Tacitus ( c . 56 – 117 ) is the most detailed and comprehensive history on the rule of Nero , despite being incomplete after the year 66 . Tacitus described the rule of the Julio @-@ Claudian emperors as generally unjust . He also thought that existing writing on them was unbalanced : 
- The histories of Tiberius , Caius , Claudius and Nero , while they were in power , were falsified through terror , and after their death were written under the irritation of a recent hatred . 
- Tacitus was the son of a procurator , who married into the elite family of Agricola . He entered his political life as a senator after Nero 's death and , by Tacitus ' own admission , owed much to Nero 's rivals . Realising that this bias may be apparent to others , Tacitus protests that his writing is true . 
- Girolamo Cardano 
- In 1562 Girolamo Cardano published in Basel his Encomium Neronis , which was one of the first historical references of the Modern era to portray Nero in a positive light . 
- 
- = = Nero in Jewish and Christian tradition = = 
- 
- 
- = = = Jewish tradition = = = 
- 
- At the end of 66 , conflict broke out between Greeks and Jews in Jerusalem and Caesarea . According to the Talmud , Nero went to Jerusalem and shot arrows in all four directions . All the arrows landed in the city . He then asked a passing child to repeat the verse he had learned that day . The child responded , " I will lay my vengeance upon Edom by the hand of my people Israel " ( Ez . 25 @,@ 14 ) . Nero became terrified , believing that God wanted the Temple in Jerusalem to be destroyed , but would punish the one to carry it out . Nero said , " He desires to lay waste His House and to lay the blame on me , " whereupon he fled and converted to Judaism to avoid such retribution . Vespasian was then dispatched to put down the rebellion . 
- The Talmud adds that Reb Meir Baal HaNess , also known as Rabbi Meir or Rabbi Meir Baal HaNes ( Rabbi Meir the miracle maker ) , was a Jewish sage who lived in the time of the Mishna and a prominent supporter of the Bar Kokhba rebellion against Roman rule . He was considered one of the greatest of the Tannaim of the third generation ( 139 @-@ 163 ) . According to the Talmud , his father was a descendant of the Roman Emperor Nero who had converted to Judaism . His wife Bruriah is one of the few women cited in the Gemara . He is the third most frequently mentioned sage in the Mishnah . 
- Roman and Greek sources nowhere report Nero 's alleged trip to Jerusalem or his alleged conversion to Judaism . There is also no record of Nero having any offspring who survived infancy : his only recorded child , Claudia Augusta , died aged 4 months . 
- 
- = = = Christian tradition = = = 
- 
- Non @-@ Christian historian Tacitus describes Nero extensively torturing and executing Christians after the fire of 64 . Suetonius also mentions Nero punishing Christians , though he does so because they are " given to a new and mischievous superstition " and does not connect it with the fire . 
- Christian writer Tertullian ( c . 155 – 230 ) was the first to call Nero the first persecutor of Christians . He wrote , " Examine your records . There you will find that Nero was the first that persecuted this doctrine " . Lactantius ( c . 240 – 320 ) also said that Nero " first persecuted the servants of God " , as does Sulpicius Severus . However , Suetonius writes that , " since the Jews constantly made disturbances at the instigation of Chrestus , he [ emperor Claudius ] expelled them from Rome " ( " Iudaeos impulsore Chresto assidue tumultuantis Roma expulit " ) . These expelled " Jews " may have been early Christians , although Suetonius is not explicit . Nor is the Bible explicit , calling Aquila of Pontus and his wife , Priscilla , both expelled from Italy at the time , " Jews " . 
- 
- = = = = Martyrdoms of Peter and Paul = = = = 
- 
- The first text to suggest that Nero ordered the execution of an apostle is a letter by Clement to the Corinthians traditionally dated to around 96 A.D. The apocryphal Ascension of Isaiah , a Christian writing from the 2nd century , says , " the slayer of his mother , who himself ( even ) this king , will persecute the plant which the Twelve Apostles of the Beloved have planted . Of the Twelve one will be delivered into his hands " ; this was interpreted to mean Nero . 
- Bishop Eusebius of Caesarea ( c . 275 – 339 ) was the first to write explicitly that Paul was beheaded in Rome during the reign of Nero . He states that Nero 's persecution led to Peter and Paul 's deaths , but that Nero did not give any specific orders . However , several other accounts going back to the 1st century have Paul surviving his two years in Rome and travelling to Hispania , before facing trial in Rome again prior to his death . 
- Peter is first said to have been crucified upside @-@ down in Rome during Nero 's reign ( but not by Nero ) in the apocryphal Acts of Peter ( c . 200 ) . The account ends with Paul still alive and Nero abiding by God 's command not to persecute any more Christians . 
- By the 4th century , a number of writers were stating that Nero killed Peter and Paul . 
- 
- = = = = The Antichrist = = = = 
- 
- The Sibylline Oracles , Books 5 and 8 , written in the 2nd century , speak of Nero returning and bringing destruction . Within Christian communities , these writings , along with others , fueled the belief that Nero would return as the Antichrist . In 310 , Lactantius wrote that Nero " suddenly disappeared , and even the burial place of that noxious wild beast was nowhere to be seen . This has led some persons of extravagant imagination to suppose that , having been conveyed to a distant region , he is still reserved alive ; and to him they apply the Sibylline verses " . Lactantius maintains that it is not right to believe this . 
- In 422 , Augustine of Hippo wrote about 2 Thessalonians 2 : 1 – 11 , where he believed Paul mentioned the coming of the Antichrist . Though he rejects the theory , Augustine mentions that many Christians believed that Nero was the Antichrist or would return as the Antichrist . He wrote , " so that in saying , ' For the mystery of iniquity doth already work , ' he alluded to Nero , whose deeds already seemed to be as the deeds of Antichrist . " 
- Some modern biblical scholars such as Delbert Hillers ( Johns Hopkins University ) of the American Schools of Oriental Research and the editors of the Oxford & Harper Collins Study Bibles , contend that the number 666 in the Book of Revelation is a code for Nero , a view that is also supported in Roman Catholic Biblical commentaries . 
- The concept of Nero as the Antichrist is often a central belief of Preterist eschatology . 
- 
- 
- = Manila = 
- 
- Manila ( / məˈnɪl.ə / ) is the capital city of the Philippines , founded on June 24 , 1571 by Spanish conquistador Miguel López de Legazpi . It is one of the oldest cities in the country and was the seat of power for most of the colonial rules of the Philippines . It is situated on the eastern shore of Manila Bay and contains a multitude of landmarks , some of which date back to the 16th century , such as the Spanish colonial era Walled City of Intramuros . 
- Manila is the second most populous city in the Philippines after the former capital Quezon City with a population of 1 @,@ 780 @,@ 148 in 2015 . Because of its small land area and huge population , Manila is regarded as one of the most densely populated cities in the world with 42 @,@ 857 people per square kilometer . Manila is one of the sixteen cities and a municipality that make up Metro Manila , the National Capital Region of the Philippines . In 2012 , Globalization and World Cities Research Network listed Manila as a global city . 
- Manila has six representative districts for the lower house of the Philippine Congress . Furthermore , the city is composed of 16 districts , namely : Binondo , Ermita , Intramuros , Malate , Paco , Pandacan , Port Area , Quiapo , Sampaloc , San Andres , San Miguel , San Nicolas , Santa Ana , Santa Cruz , Santa Mesa and Tondo . 
- The Kingdom of Tondo once ruled in the vicinity of Manila before it briefly became a province of the Hindu Majapahit Empire . During the Brunei invasion of the Philippines , Sultan Bolkiah of Brunei captured Seludong ( a village in modern @-@ day Manila ) and renamed it Maynilà , a Tagalog term referring to the presence of the Nila shrub . Maynila was a vassal state of Brunei , established to overpower Tondo . Maynilà had been Indianized since the sixth century CE and earlier . It had become partly Islamic and Hindu @-@ animist by the 15th century CE . 
- In 1571 Spanish Conquistadors arrived from Mexico , from across the Pacific , and founded present @-@ day Manila in what today is Intramuros . Spanish missionaries soon Christianized the city and incorporated Tondo under Manila and then built some of the oldest churches in the country , including San Agustin Church . The Conquistadors renamed the area Nuevo Reino de Castilla ( New Kingdom of Castille ) and shortened the name to Manila . 
- Manila became the center of Spanish activity in the Far East and one end of the Manila @-@ Acapulco Galleon trade route , linking Spanish America with Asia , one of the earliest examples of globalization . Due to the central location in the Pacific sea trade routes , Manila received the moniker of the " Pearl of the Orient " . Spanish rule of Manila and the entire Philippine archipelago lasted for over three centuries , until 1898 . At different times during the long Spanish period there were local revolts , Chinese insurrections , massive pirate attacks , great earthquakes , Dutch raids and invasion attempts , and a British occupation of the city during their unsuccessful attempt to conquer the Philippines . Order was usually quickly restored and the city returned to the business of trade . In the 19th century Manila was one of the most modern cities in Asia . Before the Spanish – American War , Manila saw the rise of the Philippine Revolution . Under the American rule following the Spanish – American War , the United States changed the official language from Spanish to English and made some changes in education , local laws and urban planning . Towards the end of World War II , during the Battle of Manila most of the city was flattened by intensive aerial bombardment by the United States Air Force . As a result , relatively little remains of Manila 's prewar and colonial architecture , although there are ongoing restoration projects , especially within the old walled city , Intramuros . 
- 
- = = History = = 
- 
- The earliest evidence of human life in and around the area of Manila is the nearby Angono Petroglyphs dated to around 3000 BC . Furthermore , negritos , a class of Australoid peoples , became the aboriginal inhabitants of the Philippines . They were found across Luzon before the Malayo @-@ Polynesians migrated in and assimilated them . 
- The Kingdom of Tondo flourished during the latter half of the Ming Dynasty as a result of direct trade relations with China . Tondo district was maintained as the traditional capital of the empire , with its rulers as sovereign kings and not mere chieftains , who were addressed variously as panginuan in Meranau or panginoón in Tagalog ( " lords " ) ; anák banwa ( " son of heaven " ) ; or lakandula ( " lord of the palace " ) . The Emperor of China considered the Lakans ( rulers of ancient Manila ) " 王 " ( Kings ) . 
- In the 13th century , Manila consisted of a fortified settlement and trading quarter at the shores of the Pasig River , on top of previous older towns . Manila was then settled by the Indianized empire of Majapahit as referenced in the epic eulogy poem Nagarakretagama which inscribed its conquest by Maharaja Hayam Wuruk . Selurong " षेलुरोन ् ग ् " which is a historical name for the city of Manila is listed in Canto 14 alongside Sulot , which is now Sulu , and Kalka . 
- During the reign of Sultan Bolkiah from 1485 to 1521 , the Bruneian Empire invaded , wanting to take advantage of Tondo 's China trade by attacking its environs and establishing " كوتا سلودوڠ Kota Saludong " ( The Kingdom of Maynila ) . They ruled under and gave yearly tribute to the Sultanate of Brunei as its satellite state . They established a new dynasty under the local leader who accepted Islam and became Rajah Salalila or Tariq Sulayman I. He also established a trading challenge to the already rich House of Lakan Dula in Tondo . Islam was further strengthened by the arrival of Muslim traders from the Arab @-@ Indian area and Southeast Asia . Manila was temporarily besieged by the invasion of Chinese pirate @-@ warlord Limahong ( 1574 ) but was thwarted by the local inhabitants , before it became the seat of the colonial government of Spain . 
- On June 24 , 1571 , Spanish conquistador Miguel López de Legazpi arrived from New Spain ( now Mexico ) , and then exercised rule of the Spanish city of Manila as a territory of New Spain with the establishment of a city council in what today is the district of Intramuros . López de Legazpi had the local royalty executed or exiled , after the failure of the Tondo Conspiracy ; a plot wherein an alliance between Japanese merchants , Luzon 's Huangs with several Datus and Rajahs plus the Bruneian Empire would band together to execute the Spaniards and their Latin @-@ American mercenaries , and Visayan allies . At the conclusion of which , the victorious Spaniards made Manila the capital of the Spanish East Indies and of the Philippines , which the empire would control for the next three centuries , from 1565 to 1898 . 
- Manila then became famous during the Manila @-@ Acapulco Galleon trade which lasted for three centuries and brought goods from Europe , Africa and Latin America across the Pacific Islands to Southeast Asia ( Which was already an entrepot for goods coming from India , Indonesia and China ) and trade also flowed vice versa . Silver that was mined in Mexico and Peru were exchanged for Chinese silk , Indian gems , and the spices of the Southeast Asia , some of which even flowed to Europe . Likewise wines and olives grown from Europe and North Africa were transshipped via Mexico towards Manila . 
- Manila was occupied by British forces for twenty months , from 1762 to 1764 , and used as a base for an unsuccessful attempt to conquer the Philippines during the Seven Years ' War . Eventually , the British withdrew from Manila as per agreements in the 1763 Treaty of Paris . The Chinese were punished for supporting the British invasion , and the small fortress @-@ city of Intramuros , mostly populated by Europeans and Mexicans , kept its cannons pointed at Binondo , the world 's oldest Chinatown . 
- Mexican Independence in 1821 necessitated direct rule from Spain . Under direct Spanish rule , banking , industry and education flourished more than it had in the previous two centuries . The opening of the Suez Canal in 1869 facilitated direct trade and communications with Spain . 
- The growing wealth and education attracted indigenous , Chinese , Indians , Latinos , and Europeans from the provinces to Manila , all of whom elected a nascent Filipino citizenship regardless of ethnicity . The developments also facilitated the rise of an illustrado class which espoused liberal ideas , the ideological foundations of the Philippine Revolution which sought independence from Spain . 
- After the Battle of Manila ( 1898 ) , Spain ceded the surrendered city of Manila to the United States . The First Philippine Republic based at nearby Bulacan fought against the Americans for control of the city of Manila . The Americans defeated the First Philippine Republic and captured president Emilio Aguinaldo who announced allegiance to the United States on April 1 , 1901 . 
- Upon drafting a new charter for Manila in June 1901 , the Americans made official what had long been tacit : that the City of Manila was not Intramuros alone but also all its arrabales . The new city charter proclaimed that Manila was composed of eleven municipal districts — presumably Tondo , Binondo , Santa Cruz , Sampaloc , San Miguel , Pandacan , Santa Ana , Paco , Malate , Ermita and Intramuros . In addition to these , the Church recognized five parishes as Manileno — namely , Gagalangin , Trozo , Balic @-@ Balic , Santa Mesa and Singalong . Later times would add two more : Balut and San Andres Bukid . 
- Under American control , a new civilian oriented Insular Government headed by then Governor @-@ General William Howard Taft invited city planner Daniel Burnham for the transformation of Manila , to adapt the old city to changed times and modern needs . The Burnham Plan included development of the road system , the use of waterways for transportation , and beautification of Manila with the improvement of waterfronts , construction of parks , parkways and various building for various activities . 
- The latter included a government center occupying all of Wallace Field , which extends from Luneta to the present Taft Avenue . The Philippine Capitol was to rise at the Taft Avenue end of the field , facing toward the sea , and would form , with the buildings of different government bureaus and departments , a quadrangle , lagoon in the center , and a monument to José Rizal at its Luneta end . Of Burnham 's proposed government center , only three units — the Legislative Building and the building of the Finance and Agricultural departments — were completed when World War II erupted . 
- Due to the Japanese occupation of the Philippines , American soldiers were ordered to withdraw from the city and all military installations were removed on December 24 , 1941 . General Douglas MacArthur declared Manila an open city to prevent further death and destruction ; despite this , the Japanese warplanes continued to bomb the city . Manila was occupied by the Japanese forces on January 2 , 1942 . 
- Manila was also the site of the bloodiest battle in the Pacific theater during the Second World War . After falling to the Empire of Japan on January 2 , 1942 , it was recaptured by joint American and Filipino troops from February 3 to March 3 , 1945 . Some 100 @,@ 000 civilians were killed in Manila in February 1945 . It was the second most devastated city in the world after Warsaw during the Second World War . At the end of World War II , almost all of the structures in the city , particularly Intramuros , were destroyed but after the war , reconstruction took place . 
- In 1948 , President Elpidio Quirino moved the seat of government of the Philippines to Quezon City , a new capital city in the suburbs and fields northeast of Manila , created in 1938 by former President Manuel L. Quezon , which was named after him . The move ended any implementation of the Burnham Plan 's intent for the government centre to be at Luneta . 
- With the Visayan @-@ born Arsenio Lacson as its first elected mayor in 1952 ( all mayors were appointed prior to this ) , Manila underwent The Golden Age , once again earning its status as the " Pearl of the Orient " , a moniker it earned before the Second World War . After Lacson 's term in the 1950s , Manila was led by Antonio Villegas for most of the 1960s . Ramon Bagatsing ( an Indian @-@ Filipino ) was mayor for nearly the entire 1970s until the 1986 People Power Revolution . Mayors Lacson , Villegas , and Bagatsing are often collectively considered as the " Big Three of Manila " less for their rather long tenures as the city 's chief executive ( continuously for over three decades , from 1952 – 1986 ) , but more for their indelible contribution to the development and progress of the city and their lasting legacy in uplifting the quality of life and welfare of the people of Manila . 
- During the administration of President Ferdinand Marcos , the region of the Metro Manila was created as an integrated unit with the enactment of Presidential Decree No. 824 on November 7 , 1975 . The area encompassed four cities and thirteen adjoining towns , as a separate regional unit of government . On the 405th anniversary of the city 's foundation on June 24 , 1976 , Manila was reinstated by Marcos as the capital of the Philippines for its historical significance as the seat of government since the Spanish Period . Presidential Decree No. 940 states that Manila has always been to the Filipino people and in the eyes of the world , the premier city of the Philippines being the center of trade , commerce , education and culture . 
- During the martial law era , Manila became a hot @-@ bed of resistance activity as youth and student demonstrators repeatedly clashed with the police and military which were subservient to the Marcos regime . After decades of resistance , the non @-@ violent People Power Revolution ( predecessor to the peaceful @-@ revolutions that toppled the iron @-@ curtain in Europe ) , ousted the authoritarian Marcos from power . 
- In 1992 , Alfredo Lim was elected mayor , the first Chinese @-@ Filipino to hold the office . He was known for his anti @-@ crime crusades . Lim was succeeded by Lito Atienza , who served as his vice @-@ mayor . Atienza was known for his campaign ( and city slogan ) " Buhayin ang Maynila " ( Revive Manila ) , which saw the establishment of several parks and the repair and rehabilitation of the city 's deteriorating facilities . He was the city 's mayor for 3 terms ( 9 years ) before being termed out of office . 
- Alfredo Lim once again ran for mayor and defeated Atienza 's son Ali in the 2007 city election and immediately reversed all of Atienza 's projects claiming Atienza 's projects made little contribution to the improvements of the city . The relationship of both parties turned bitter , with the two pitting again during the 2010 city elections in which Lim won against Atienza . 
- Lim was sued by councilor Dennis Alcoreza in 2008 over human rights , charged with graft over the rehabilitation of public schools , and was heavily criticized for his haphazard resolution of the Rizal Park hostage taking incident , one of the deadliest hostage crises in the Philippines . Later on , Vice Mayor Isko Moreno and 28 city councilors filed another case against Lim in 2012 , stating that Lim 's statements in a meeting were " life @-@ threatening " to them . In the 2013 elections , former President Joseph Estrada defeated Lim in the mayoral race . During his term , Estrada has paid the city 's over ₱ 5 billion debts , increased revenues by 2 @.@ 35 times from ₱ 6 @.@ 2 billion in 2012 to ₱ 14 @.@ 6 billion by 2016 , spent from 2013 to 2016 an unprecedented ₱ 6 @.@ 76 billion for the city 's infrastructure , built and / or renovated seven city public markets , built 22 schools , increased teachers ' incomes , modernized the six city hospitals and bought dialysis machines and magnetic resonance imaging scanners , increased the efficiency of the police force and reduced crime . Manila had become the most competitive city in the Philippines by 2015 , making the city the best place for doing business and for living in . 
- Despite his achievements and unprecedented feat as the Mayor of Manila , Estrada was only narrowly re @-@ elected as Manila mayor in the 2016 election against Lim and Amado Bagatsing , winning by a slim margin of 2 @,@ 830 votes . 
- 
- = = Geography = = 
- 
- Manila is located on the eastern shores of Manila bay , which rests on the western shores of Luzon . Manila lies 800 miles ( 1 @,@ 300 kilometers ) from mainland Asia . The Pasig River bisects Manila . 
- Almost all of Manila sits on top of centuries of prehistoric alluvial deposits built by the waters of the Pasig and on some land reclaimed from Manila Bay . Manila 's land has been altered substantially by human intervention , with considerable land reclamation along the waterfronts since the American colonial times . Some of the natural variations in topography have been evened out due to the urbanization of the city . As of 2013 , Manila has a total area of 42 @.@ 88 square kilometres ( 16 @.@ 56 sq mi ) . 
- 
- = = = Earthquakes = = = 
- 
- Manila sits astride the Pacific typhoon belt and is criss @-@ crossed by several fault lines . This led to Manila and its metropolitan region being ranked as the second riskiest capital ( city ) to live in according to Swiss Re . The seismically active Marikina Valley Fault System poses a threat to Manila and the surrounding regions . 
- Manila has endured several deadly earthquakes , notably in 1645 and in 1677 which destroyed the stone and brick medieval city . The Earthquake Baroque style was used by the Colonial architects during the Spanish colonial period in order to adapt to the frequent earthquakes . 
- 
- = = = Climate = = = 
- 
- Under the Köppen climate classification system , Manila features a tropical savanna climate ( Köppen climate classification Aw ) . Together with the rest of the Philippines , Manila lies entirely within the tropics . Its proximity to the equator means that the temperature range is very small , rarely going below 20 ° C ( 68 ° F ) or above 38 ° C ( 100 ° F ) . Temperature extremes have ranged from 14 @.@ 5 ° C ( 58 @.@ 1 ° F ) on January 11 , 1914 to 38 @.@ 6 ° C ( 101 @.@ 5 ° F ) on May 7 , 1915 . 
- Humidity levels are usually very high all year round . Manila has a distinct dry season from December through May , and a relatively lengthy wet season that covers the remaining period with slightly cooler temperatures . In the rainy season it rarely rains all day but the rainfall is very heavy during short periods . Typhoons usually occur from June to September . 
- 
- = = = Environment = = = 
- 
- Due to industrial waste and automobiles , Manila suffers from air pollution , affecting 98 % of the population . Annually , the air pollution causes more than 4 @,@ 000 deaths . Ermita is Manila 's most air polluted district due to open dump sites and industrial waste . According to a report in 2003 , The Pasig River is one of the most polluted rivers in the world with 150 tons of domestic waste and 75 tons of industrial waste dumped daily . 
- Annually , Manila is hit with 6 to 7 typhoons creating floods . In 2009 , Typhoon Ketsana struck the Philippines . In the aftermath of Typhoon Ketsana , the lack of infrastructure led to one of the worst floods in the Philippines and created a significant amount of pollution . Following the aftermath of Typhoon Ketsana , the city began to dredge its rivers and improve its drainage network . The Pasig River Rehabilitation Commission is in charge of cleaning up the Pasig River and tributaries for transportation , recreation and tourism purposes . Rehabilitation efforts have resulted in the creation of parks along the riverside , along with stricter pollution controls . 
- 
- = = Cityscape = = 
- 
- 
- = = = Architecture = = = 
- 
- Manila has architecturally significant buildings in a wide range of styles spanning distinct historical and cultural periods . Architectural styles reflect American , Spanish , Chinese , and Malay influences . Prominent Filipino architects such as Antonio Toledo , Felipe Roxas , Juan M. Arellano and Tomás Mapúa have designed significant buildings in Manila such as churches , government offices , theaters , mansions , schools and universities . 
- Manila is known for its distinct Art Deco theaters which are designed by National Artists such as Juan Nakpil and Pablo Antonio . The historic Escolta Street in Binondo features many buildings of neo @-@ classical and beaux @-@ arts architectural style , many of which were designed by prominent Filipino architects during the American Rule in the 1920s to the late 1930s . Many architects , artists , historians and heritage advocacy groups are pushing for the revival of Escolta Street , which was once the premier street of the Philippines . 
- Unfortunately , much of Manila 's prewar and Spanish colonial architecture was destroyed during World War II . Reconstruction took place afterwards , replacing the destroyed historic Spanish @-@ era buildings with modern ones , erasing much of the city 's character . Some buildings destroyed by the war have been reconstructed , such as the Old Legislative Building ( National Museum ) , Ayuntamiento de Manila ( Bureau of the Treasury ) and the currently under construction San Ignacio Church ( Museo de Intramuros ) . Plans have been laid out to rehabilitate several neglected historic buildings and places such as Plaza Del Carmen , San Sebastian Church and the Manila Metropolitan Theater and soon Spanish @-@ era shops and houses in Quiapo , Binondo , and San Nicolas will be restored to its former splendor , as a part of a movement to restore Manila to its former glory . 
- Since Manila is prone to earthquakes , the Spanish colonial architects invented the style called Earthquake Baroque which the churches and government buildings during the Spanish colonial period adopted . As a result , succeeding earthquakes of the 18th and 19th centuries barely affected Manila , although they did periodically level the surrounding area . Modern buildings in and around Manila are designed or have been retrofitted to withstand an 8 @.@ 2 magnitude quake in accordance with the country 's building code . 
- 
- = = = Barangays and districts = = = 
- 
- Manila is composed of fourteen districts according to Republic Act No. 409 , otherwise known as the Revised Charter of the City of Manila . Two were later added , which are Santa Mesa ( partitioned off from Sampaloc ) and San Andres ( partitioned off from Santa Ana ) . 
- The city has 896 barangays that are known by sequential numbers instead of names . These barangays are further grouped into 100 zones for administrative and municipal purposes . 
- 
- = = = Military and national security = = = 
- 
- The headquarters of the Philippine Coast Guard is located at the South Harbor in Port Area near Intramuros and Ermita . The Philippine Navy on the other hand has its headquarters in Naval Station Jose Andrada located along Roxas Boulevard in Malate . Furthermore , the AFP Joint Task Force @-@ National Capital Region was created in 2012 to ensure peace and stability in Metro Manila , of which Manila is a part . It bears the same functions of the deactivated National Capital Regional Command , although it operates on a much smaller size than its predecessor . 
- 
- = = = Slums = = = 
- 
- There are an estimated 4 million slum dwellers living in Manila as of 2014 . 
- 
- = = Demographics = = 
- 
- According to the 2015 census , the population of the city was 1 @,@ 780 @,@ 148 , making it the second most populous city in the Philippines . 
- Manila is the most densely populated city in the world with 43 @,@ 079 inhabitants per km2 . District 6 is listed as being the most dense with 68 @,@ 266 inhabitants per km2 , followed by District 1 with 64 @,@ 936 and District 2 with 64 @,@ 710 , respectively . District 5 is the least densely populated area with 19 @,@ 235 . 
- Manila 's population density dwarfs that of Kolkata ( 27 @,@ 774 inhabitants per km2 ) , Mumbai ( 22 @,@ 937 inhabitants per km2 ) , Paris ( 20 @,@ 164 inhabitants per km2 ) , Dhaka ( 19 @,@ 447 inhabitants per km2 ) , Shanghai ( 16 @,@ 364 inhabitants per km2 , with its most dense district , Nanshi , having a density of 56 @,@ 785 inhabitants per km2 ) , and Tokyo ( 10 @,@ 087 inhabitants per km2 ) . 
- The vernacular language is Filipino , based mostly on the Tagalog of surrounding areas , and this Manila form of speaking Tagalog has essentially become the lingua franca of the Philippines , having spread throughout the archipelago through mass media and entertainment . Meanwhile , English is the language most widely used in education , business , and heavily in everyday usage throughout the Metro Manila region and the Philippines itself . 
- A number of older residents can still speak basic Spanish , which used to be a mandatory subject in the curriculum of Philippine universities and colleges , and many children of Japanese Filipino , Indian Filipino , and other migrants or expatriates also speak their parents ' languages at home , aside from English and / or Filipino for everyday use . Minnan Chinese ( known as Lannang @-@ oe ) is spoken by the city 's Chinese @-@ Filipino community . 
- 
- = = Economy = = 
- 
- The city is a major center for commerce , banking and finance , retailing , transportation , tourism , real estate , new media as well as traditional media , advertising , legal services , accountancy , insurance , theater , fashion , and the arts in the Philippines . 
- The Cities and Municipalities Competitiveness Index , published by the National Competitiveness Council of the Philippines , ranks the cities , municipalities and provinces of the country according to their economic dynamism , government efficiency and infrastructure . Manila placed third in the Highly Urbanized City ( HUC ) category . Previously , Manila was the country 's most competitive city in 2015 , making it the best place to live in and do business . 
- The Port of Manila is the largest seaport in the Philippines , making it the premier international shipping gateway to the country . The Philippine Ports Authority is the government agency responsible for overseeing the operation and management of the ports . The International Container Terminal Services Inc. , cited by the Asian Development Bank as one of the top five major maritime terminal operators in the world , has its headquarters and main operations on the ports of Manila . Another port operator , the Asian Terminal Incorporated , has its corporate office and main operations in the Manila South Harbor and its container depository located in Santa Mesa . 
- Binondo , the oldest and one of the largest Chinatowns in the world , was the center of commerce and business activities in the city . Numerous residential and office skyscrapers are found within its medieval streets . Plans to make the Chinatown area into a business process outsourcing ( BPO ) hub are progressing and are being aggressively pursued by the city government of Manila . 30 buildings are already identified to be converted into BPO offices . These buildings , which are all unoccupied and can be converted into offices , are mostly located along the Escolta Street of Binondo . 
- Divisoria in Tondo is dubbed as the " shopping mecca of the Philippines " . Numerous shopping malls are located in this place , which sell products and goods at bargain prices . Small vendors occupy several roads , which causes pedestrian and vehicular traffic . A famous landmark in Divisoria is the Tutuban Center , a large shopping mall that is a part of the Philippine National Railways ' Main Station . It attracts 1 million people every month , but is expected to add another 400 @,@ 000 people when the LRT @-@ 2 West Extension is constructed , making it Manila 's busiest transfer station . 
- Diverse manufacturers within the city produce industrial @-@ related products such as chemicals , textiles , clothing , and electronic goods . Food and beverages and tobacco products are also produced . Local entrepreneurs continue to process primary commodities for export , including rope , plywood , refined sugar , copra , and coconut oil . The food @-@ processing industry is one of the most stable major manufacturing sectors in the city . 
- The Pandacan Oil Depot houses the storage facilities and distribution terminals of the three major players in the country 's petroleum industry , namely Caltex Philippines , Pilipinas Shell and Petron Corporation . The oil depot has been a subject of various concerns , including its environmental and health impact on the residents of Manila . The Supreme Court has ordered that the oil depot be relocated outside the city by July 2015 , but it failed to meet this deadline . It is currently being demolished , with demolition expected to be finished before the year 2016 ends , and plans have been set up to turn this 33 hectare facility into a transport hub or even a food park . 
- Manila is a major publishing center in the Philippines . Manila Bulletin , the Philippines ' largest broadsheet newspaper by circulation , is headquartered inside Intramuros . Other major publishing companies in the country like The Manila Times , The Philippine Star and Manila Standard Today are headquartered inside the Port Area . The Chinese Commercial News , the Philippines ' oldest existing Chinese @-@ language newspaper , and the country 's third @-@ oldest existing newspaper is headquartered in Binondo . 
- Manila serves as the headquarters of the Central Bank of the Philippines which is located along Roxas Boulevard . Some universal banks in the Philippines that have their headquarters in the city are the Landbank of the Philippines and Philippine Trust Company . Philam Life Insurance Company , currently the largest life insurance company in the Philippines in terms of assets , net worth , investment and paid @-@ up capital , has its headquarters along United Nations Avenue in Ermita . Unilever Philippines has its corporate office along United Nations Avenue in Paco . Toyota , a company listed in the Forbes Global 2000 , also has its regional office along UN Avenue . 
- 
- = = = Tourism = = = 
- 
- Tourism is a vital industry in Manila , and it welcomes approximately over 1 million tourists each year . Major destinations include the walled city of Intramuros , the National Theater at the Cultural Center of the Philippines , Manila Ocean Park , Binondo , Ermita , Malate , Manila Zoo , National Museum of the Philippines and Rizal Park . 
- Rizal Park , also known as Luneta Park , is the national park of the country and has an area of 58 hectares ( 140 acres ) , making it the largest urban park in Asia . In the Tourism Act of 2009 , Rizal Park along with Intramuros is designated as a flagship destination to become a tourism enterprise zone . A new attraction called Paseo de Manila is expected to rise in the park . The park was constructed as an honor and dedication to the country 's national hero José Rizal , who was executed by the Spaniards on charges of subversion . The flagpole west of the Rizal Monument is the Kilometer Zero marker for distances to the rest of the country . 
- Intramuros is the historic center of Manila . Originally , it was considered to be Manila itself at the time when the Philippines was under the Spanish Empire colonial rule . Owing to its history and cultural value , Intramuros and Rizal Park are designated as flagship destinations to become tourism enterprise zones in the Tourism Act of 2009 . Intramuros is managed by the Intramuros Administration ( IA ) . 
- The architecture of Intramuros reflects the Spanish colonial style and the American neoclassical architectural style , since the Philippines was a colony of Spain and the United States before it was granted its independence in 1946 . Kalesa is a popular mode of transportation in Intramuros and nearby places such as Binondo , Ermita and the Rizal Park . 
- Popular tourist destinations in Intramuros include the Baluarte de San Diego , Club Intramuros Golf Course , Cuartel de Santa Lucia , Fort Santiago , Manila Cathedral , Palacio Arzobispal , Palacio de Santa Potenciana , Palacio del Gobernador , Plaza Mexico , Plaza de Roma , San Agustin Church and the Ayuntamiento de Manila . 
- Some of the country 's oldest schools were founded in Intramuros ; these are the University of Santo Tomas ( 1611 ) , Colegio de San Juan de Letran ( 1620 ) , and Ateneo de Manila University ( 1859 ) . Only Colegio de San Juan de Letran remains at Intramuros ; the University of Santo Tomas transferred to a new campus at Sampaloc in 1927 , and Ateneo left Intramuros for Loyola Heights , Quezon City ( while still retaining " de Manila " in its name ) in 1952 . Other prominent educational institutions include the Manila High School and the University of the City of Manila . 
- The Department of Tourism designates Manila as the pioneer of medical tourism , expecting it to generate $ 1 billion in revenue annually . However , lack of progressive health system , inadequate infrastructure and the unstable political environment are seen as hindrances for its growth . 
- 
- = = = Shopping centers = = = 
- 
- Manila is a well @-@ known shopping hub of the country and it has been named as one of the best shopping destinations in Asia . Major shopping malls , markets and bazaars thrive in Manila . 
- Robinsons Place Manila is the largest shopping mall in the city . The mall was the second and by @-@ far , the largest Robinson Mall ever built by John Gokongwei . SM Supermall maintains presence in the city . One of their shopping mall is the SM City Manila , the first SM Supermall in the city featuring major SM brands like The SM Store , SM Supermarket , SM Cinemas and SM Foodcourt . It is located right beside the Manila City Hall . SM City San Lazaro is the second SM Supermall in Manila . It is located in Santa Cruz . SM City San Lazaro was constructed on the site of the former San Lazaro Hippodrome . The building of the former Manila Royal Hotel in Quiapo which is famed for its revolving restaurant atop is now the SM Clearance Center which was established in 1972 . The site of the first SM Store is located at Carlos Palanca Sr. ( formerly Echague ) Street in San Miguel . 
- Quiapo is referred to as the " Old Downtown " where tiangges , markets , boutique shops , music and electronics stores are common . C.M. Recto Avenue is where lots of department stores are located . One of Recto Avenue 's famous destinations is Divisoria , home to numerous shopping malls in the city . It is also dubbed as the shopping mecca of the Philippines where everything is sold at bargain prices . Binondo , the oldest Chinatown in the world , is the city 's center of commerce and trade for all types of businesses run by Filipino @-@ Chinese merchants with a wide variety of Chinese and Filipino shops and restaurants . 
- 
- = = Arts , culture and religion = = 
- 
- 
- = = = Religion = = = 
- 
- 
- = = = = Christianity = = = = 
- 
- As a result of Spanish cultural influence , Manila is a predominantly Christian ( Catholic ) city . As of 2010 , Roman Catholics comprise 83 @.@ 5 % of the population , followed by adherents of the Philippine Independent Church ( 2 @.@ 4 % ) ; Iglesia ni Cristo ( 1 @.@ 9 % ) ; various Protestant churches ( 1 @.@ 8 % ) ; and Buddhists ( 1 @.@ 1 % ) . Members of Islam and other religions comprise the remaining 10 @.@ 4 % of the city 's population . 
- Manila is the site of prominent Catholic churches and institutions . The Manila Cathedral is the seat of the Roman Catholic Archdiocese of Manila and the oldest established church in the country . Aside from the Manila Cathedral , there are also three other basilicas in the city : Quiapo Church , Binondo Church , and the Minor Basilica of San Sebastián . The San Agustín Church in Intramuros is a UNESCO World Heritage Site and is one of the two fully air @-@ conditioned Catholic churches in the city . Manila also has other parishes located throughout the city , with some of them dating back to the Spanish Colonial Period when the city serves as the base for numerous Catholic missions both within the Philippines and to Asia beyond . 
- Several Mainline Protestant denominations are headquartered in the city . St. Stephen 's Parish pro @-@ cathedral in the Sta . Cruz district is the see of the Episcopal Church in the Philippines ' Diocese of Central Philippines , while along Taft Avenue are the main cathedral and central offices of the Iglesia Filipina Independiente ( also called the Aglipayan Church , a national church that was a product of the Philippine Revolution ) . Other faiths like The Church of Jesus Christ of Latter @-@ day Saints maintain a presence in the city . 
- The indigenous Iglesia ni Cristo has several locales ( akin to parishes ) in the city , including its very first chapel ( now a museum ) in Punta , Sta . Ana . Evangelical , Pentecostal and Seventh @-@ day Adventist denominations also thrive within the city . The headquarters of the Philippine Bible Society is in Manila . Also , the main campus of the Cathedral of Praise is located along Taft Avenue . Jesus Is Lord Church also has several branches and campuses in Manila , and celebrates its anniversary yearly at the Burnham Green and Quirino Grandstand in Rizal Park . 
- 
- = = = = Other faiths = = = = 
- 
- The city also hosts other religions . There are many Buddhist and Taoist temples serving the Chinese Filipino community . Quiapo is home to a sizable Muslim population which worships at Masjid Al @-@ Dahab . Members of the Indian expatriate population have the option of worshiping at the large Hindu temple in the city , or at the Sikh gurdwara along United Nations Avenue . The National Spiritual Assembly of the Bahá 'ís of the Philippines , the governing body of the Filipino Bahá 'í community , is headquartered near Manila 's eastern border with Makati . 
- 
- = = = Annual cultural events and religious festivities = = = 
- 
- Manila celebrates civic and national holidays . Manila Day , which celebrates the city 's founding on June 24 , 1571 , was first proclaimed by Herminio A. Astorga ( then Vice Mayor of Manila ) on June 24 , 1962 and has been annually commemorated , under the patronage of John the Baptist . Locally , each of the city 's barangays also has its own festivities guided by its own patron saint . The city is also the host to the Feast of the Black Nazarene , held every January 9 , which draws millions of Catholic devotees . Another religious feast held in Manila was the Feast of the Nuestra Señora de los Desamparados de Manila ( Our Lady of the Abandoned ) , the patron saint of Santa Ana , which was held every May 12 . Non @-@ religious holidays include the New Year 's Day , National Heroes ' Day , Bonifacio Day and Rizal Day . 
- 
- = = = Museums and art galleries = = = 
- 
- As the cultural center of the Philippines , Manila is the home to a number of museums . The National Museum of the Philippines Complex , which includes the National Museum of Fine Arts , Museum of Anthropology and the Museum of Natural History , is located on the northeast part of Rizal Park facing Taft Avenue , on the site of the national government center proposed during the American period . Museums established by educational institutions include the Mabini Shrine , the Museum of Contemporary Art and Design , UST Museum of Arts and Sciences , and the UP Museum of a History of Ideas . 
- Bahay Tsinoy , one of Manila 's most prominent museums , documents the Chinese lives and contributions in the history of the Philippines . The Intramuros Light and Sound Museum chronicles the Filipinos desire for freedom during the revolution under Rizal 's leadership and other revolutionary leaders . The Metropolitan Museum of Manila exhibits the Filipino arts and culture . 
- Other museums in the city are the Museum of Manila , the city @-@ owned museum that exhibits the city 's culture and history , Museo Pambata , a children 's museum , the Museum of Philippine Political History , which exhibits notable political events in the country , the Parish of the Our Lady of the Abandoned and the San Agustin Church Museum , which houses religious artifacts , and Plaza San Luis , a public museum . 
- 
- = = Sports = = 
- 
- Sports in Manila have a long and distinguished history . The city 's , and in general the country 's main sport is basketball , and most barangays have a makeshift basketball court , with court markings drawn on the streets . Larger barangays have covered courts where interbarangay leagues are held every summer ( April to May ) . 
- The city has several well @-@ known sports venues , such as the Rizal Memorial Sports Complex and San Andres Gym , the home of the now defunct Manila Metrostars . The Rizal Memorial Sports Complex houses the Rizal Memorial Track and Football Stadium , the Baseball Stadium , Tennis Courts , Memorial Coliseum and the Ninoy Aquino Stadium ( the latter two are indoor arenas ) . 
- The Rizal complex had hosted several multi @-@ sport events , such as the 1954 Asian Games and the 1934 Far Eastern Games . Whenever the country hosts the Southeast Asian Games , most of the events are held at the complex , but in the 2005 Games , most events were held elsewhere . The 1960 ABC Championship and the 1973 ABC Championship , forerunners of the FIBA Asia Championship , were hosted by the complex , with the national basketball team winning both tournaments . The 1978 FIBA World Championship was held at the complex although the latter stages were held in the Araneta Coliseum in Quezon City , Southeast Asia 's largest indoor arena at that time . 
- Manila also hosts several well @-@ known sports facilities such as the Enrique M. Razon Sports Center and the University of Santo Tomas Sports Complex , both of which are private venues owned by a university ; collegiate sports are also held , with the University Athletic Association of the Philippines and the National Collegiate Athletic Association basketball games held at Rizal Memorial Coliseum and Ninoy Aquino Stadium , although basketball events had transferred to San Juan 's Filoil Flying V Arena and the Araneta Coliseum in Quezon City . Other collegiate sports are still held at the Rizal Memorial Sports Complex . Professional basketball also used to play at the city , but the Philippine Basketball Association now holds their games at Araneta Coliseum and Cuneta Astrodome at Pasay ; the now defunct Philippine Basketball League played some of their games at the Rizal Memorial Sports Complex . 
- The Manila Storm are the city 's rugby league team training at Rizal Park ( Luneta Park ) and playing their matches at Southern Plains Field , Calamba , Laguna . 
- Previously a widely played sport in the city , Manila is now the home of the only sizable baseball stadium in the country , at the Rizal Memorial Baseball Stadium . The stadium hosts games of Baseball Philippines ; Lou Gehrig and Babe Ruth were the first players to score a home run at the stadium at their tour of the country on December 2 , 1934 . 
- Other popular sports in the city are cue sports , and billiard halls are a feature in most barangays . The 2010 World Cup of Pool was held at Robinsons Place Manila . 
- The Rizal Memorial Track and Football Stadium hosted the first FIFA World Cup qualifier in decades when the Philippines hosted Sri Lanka in July 2011 . The stadium , which was previously unfit for international matches , had undergone a major renovation program prior to the match . The Football Stadium now regularly hosts matches of the United Football League . The stadium also hosted its first rugby test when it hosted the 2012 Asian Five Nations Division I tournaments . 
- 
- = = Government and politics = = 
- 
- The government of Manila is divided into three branches : executive , legislative and judiciary . The judicial branch is administered solely by the Supreme Court of the Philippines under the Metro Manila judicial region . The city government has control of the executive and legislative branches . Manila employed 11 @,@ 919 personnel at the end of 2014 . 
- The current Mayor of Manila is Joseph Estrada , who served as the President of the Philippines from 1998 @-@ 2001 . He is also the head of the executive department of the city . The legislative arm which is composed of six elected city councilors , is headed by the Vice Mayor . Former actor Isko Moreno currently serves as the city 's vice mayor . Altogether they are assisted by the Manila City Council , the local President of the Association of Barangay Captains , and the President of the Sangguniang Kabataan . Their offices are located at the Manila City Hall . 
- 
- = = = Finance = = = 
- 
- On September 25 , 2014 , the Commission on Audit released its 2013 Annual Financial Report citing the city 's income at ₱ 10 @.@ 1 billion with an asset worth of ₱ 18 @.@ 6 billion . Its local income stood at ₱ 5 @.@ 41 billion and its national government allocation was ₱ 1 @.@ 74 billion , having an annual regular income ( ARI ) of an estimated ₱ 7 @.@ 15 billion . Manila 's net income stood at ₱ 3 @.@ 54 billion in 2014 . 
- Among the local government units , Manila has the highest budget allocation to health . It was also one of the cities with the highest tax and internal revenue . Tax revenue accounts for 46 % of the city 's income in 2012 . 
- 
- = = = Districts and barangays = = = 
- 
- Manila has six legislative districts that serve as the constituencies for the election of the city 's representatives to the lower house of the Congress of the Philippines and of the regular members to the Sangguniang Panlungsod ( SP ; City Council ) . Each district elects one representative to the House of Representatives and six SP members to the council . The city , along with the rest of the nation , elects 12 senators as one at @-@ large district . 
- Manila is politically divided into 896 barangays , the smallest unit of local government in the Philippines . Each barangay has its own chairperson and councilors . For administrative convenience , all the barangays in Manila are grouped into 100 zones . These zones have no form of local government . 
- The 1st District ( 2015 population : 415 @,@ 906 ) is Manila 's ( and the country 's ) most densely populated congressional district . It covers the western portion of Tondo that lies along Manila Bay . 
- The 2nd District ( 2015 population : 215 @,@ 457 ) covers the eastern inland portion of Tondo , a neighborhood or sub @-@ district known as Gagalangin . 
- The 3rd District ( 2015 population : 197 @,@ 242 ) covers Binondo , Quiapo , San Nicolas and Santa Cruz . 
- The 4th District ( 2015 population : 265 @,@ 046 ) covers Sampaloc . 
- The 5th District ( 2015 population : 366 @,@ 714 ) covers Ermita , Malate , Port Area , Intramuros , San Andres Bukid , and a portion of Paco ( except Zone 90 ) . 
- The 6th District ( 2007 population : 295 @,@ 245 ) covers Paco ( Zone 90 only ) , Pandacan , San Miguel , Santa Ana and Santa Mesa . 
- Manila has the most number of barangays of any city or municipality in the Philippines . Attempts at reducing its number have not prospered despite local legislation — Ordinance 7907 , passed on 23 April 1996 — reducing the number from 897 to 150 by merging existing barangays , because of the failure to hold a plebiscite . 
- 
- = = = National government = = = 
- 
- Manila , being the seat of political power of the Philippines , has several national government offices headquartered at the city . Planning for the development for being the center of government started during the early years of American colonization to the country when they envisioned a well @-@ designed city outside the walls of Intramuros . The strategic location chosen was Bagumbayan , a former town which is now the Rizal Park to become the center of government and a design commission was given to Daniel Burnham to create a master plan for the city patterned after Washington D.C .. These improvements were eventually abandoned under the Commonwealth Government of Manuel L. Quezon . 
- A new government center was to be built on the hills northeast of Manila , or what is now Quezon City . Several government agencies have set up their headquarters in Quezon City but several key government offices still reside in Manila . However , many of the plans were substantially altered after the devastation of Manila during World War II and by subsequent administrations . 
- The city , as the capital , still hosts the Office of the President , as well as the president 's official residence . Aside from these , important institutions such as the Supreme Court , the Court of Appeals , the Bangko Sentral ng Pilipinas , the Departments of Budget and Management , Finance , Health , Justice , Labor and Employment and Public Works and Highways still call the city home . Manila also hosts important national institutions such as the National Library , National Archives , National Museum and the Philippine General Hospital . 
- Congress previously held office at the Old Congress Building . In 1972 , due to declaration of martial law , Congress was dissolved ; its successor , the unicameral Batasang Pambansa , held office at the new Batasang Pambansa Complex . When a new constitution restored the bicameral Congress , the House of Representatives stayed at the Batasang Pambansa Complex , while the Senate remained at the Old Congress Building . In May 1997 , the Senate transferred to a new building it shares with the Government Service Insurance System at reclaimed land at Pasay . 
- 
- = = Infrastructure = = 
- 
- 
- = = = Utilities = = = 
- 
- 
- = = = = Water and electricity = = = = 
- 
- Water services used to be provided by the Metropolitan Waterworks and Sewerage System , which served 30 % of the city with most other sewage being directly dumped into storm drains , septic tanks , or open canals . MWSS was privatized in 1997 which split the water concession into the east and west zones . The Maynilad Water Services took over the west zone of which Manila is a part . It now provides the supply and delivery of potable water and sewerage system in Manila , but it does not provide service to the southeastern part of the city which belongs to the east zone that is served by Manila Water . Electric services are provided by Meralco , the sole electric power distributor in Metro Manila . 
- 
- = = = Transportation = = = 
- 
- One of the more famous modes of transportation in Manila is the jeepney . Patterned after U.S. army jeeps , these have been in use since the years immediately following World War II . The Tamaraw FX , the third generation Toyota Kijang , which competed directly with jeepneys and followed fixed routes for a set price , once plied the streets of Manila . 
- On a for @-@ hire basis , the city is served by numerous taxicabs , " tricycles " ( motorcycles with sidecars , the Philippine version of the auto rickshaw ) , and " trisikads " or " sikads " ( bicycles with a sidecars , the Philippine version of pedicabs ) . In some areas , especially in Divisoria , motorized pedicabs are popular . Spanish @-@ era horse @-@ drawn calesas are still a popular tourist attraction and mode of transportation in the streets of Binondo and Intramuros . All types of public road transport are privately owned and operated under government franchise . 
- The city is serviced by the LRT @-@ 1 and LRT @-@ 2 which forms the LRTA system , as distinct from the MRT @-@ 3 which is under the MRTC system that services other parts of Metro Manila . Development of the railway system began in the 1970s under the Marcos administration , making it the first light rail transport in Southeast Asia . These systems are currently undergoing a multibillion @-@ dollar expansion . LRT Line 1 runs along the length of Taft Avenue ( R @-@ 2 ) and Rizal Avenue ( R @-@ 9 ) , and the LRT Line 2 runs along Claro M. Recto Avenue ( C @-@ 1 ) and Ramon Magsaysay Boulevard ( R @-@ 6 ) from Santa Cruz , through Quezon City , up to Santolan in Marikina . 
- The main terminal of the Philippine National Railways lies within the city . One commuter railway within Metro Manila is in operation . The line runs in a general north @-@ south direction from Tutuban ( Tondo ) toward Laguna . The Port of Manila , located in the vicinity of Manila Bay is the chief seaport of the Philippines . The Pasig River Ferry Service which runs on the Pasig River is another form of transportation . The city is also served by the Ninoy Aquino International Airport and Clark International Airport . 
- In 2006 , Forbes magazine ranked Manila " the world 's most congested city " . Manila has become notorious for its frequent traffic jams and high densities . The government has undertaken several projects to alleviate the traffic in the city . Some of the projects include : the construction of a new flyover at Sampaloc , the construction of the Metro Manila Skyway Stage 3 , the proposed LRT Line 2 ( west ) extension from Recto to Tondo or the Port Area , and the expansion of several national and local roads . However , such projects have yet to make any meaningful impact , and the traffic jams and congestion continue unabated . The urban planning of the Manila and the whole metropolis was based on the Metro Manila Dream Plan , which seeks to address the problems of Metro Manila 's urban planning and transportation . It consists of a list of short term priority projects and medium to long term infrastructure projects that will last up to 2030 . 
- 
- = = Healthcare = = 
- 
- The Manila Health Department is responsible for the planning and implementation of the health care programs provided by the city government . It operates 59 health centers and six city @-@ run hospitals , which are free of charge . The six public city @-@ run hospitals are the Ospital ng Maynila Medical Center , Ospital ng Sampaloc , Gat Andres Bonifacio Memorial Medical Center , Ospital ng Tondo , Sta . Ana Hospital , and Justice Jose Abad Santos General Hospital . Manila is also the site of the Philippine General Hospital , the tertiary state @-@ owned hospital administered and operated by the University of the Philippines Manila . 
- Manila 's healthcare is also provided by private corporations . Private hospitals that operate in the city are the Manila Doctors Hospital , Chinese General Hospital and Medical Center , Dr. José R. Reyes Memorial Medical Center , Metropolitan Medical Center , Our Lady of Lourdes Hospital , and the University of Santo Tomas Hospital . 
- The Department of Health has its main office in Manila . The national health department also operates the San Lazaro Hospital , a special referral tertiary hospital . Manila is also the home to the headquarters of the World Health Organization Regional Office for the Western Pacific and the World Health Organization Country Office for the Philippines . 
- 
- = = Education = = 
- 
- The center of education since the colonial period , Manila — particularly Intramuros — is home to several Philippine universities and colleges as well as its oldest ones . It served as the home of the University of Santo Tomas ( 1611 ) , Colegio de San Juan de Letran ( 1620 ) , Ateneo de Manila University ( 1859 ) , Lyceum of the Philippines University and the Mapua Institute of Technology . Only Colegio de San Juan de Letran ( 1620 ) remains at Intramuros ; the University of Santo Tomas transferred to a new campus at Sampaloc in 1927 , and Ateneo left Intramuros for Loyola Heights , Quezon City ( while still retaining " de Manila " in its name ) in 1952 . 
- The University of the City of Manila ( Pamantasan ng Lungsod ng Maynila ) located at Intramuros , and Universidad de Manila located just outside the walled city , are both owned and operated by the Manila city government . The national government controls the University of the Philippines Manila , the oldest of the University of the Philippines constituent universities and the center of health sciences education in the country . The city is also the site of the Polytechnic University of the Philippines , the largest university in the country in terms of student population . 
- The University Belt refers to the area where there is a high concentration or a cluster of colleges and universities in the city and it is commonly understood as the one where the San Miguel , Quiapo and Sampaloc districts meet . Generally , it includes the western end of España Boulevard , Nicanor Reyes St. ( formerly Morayta St. ) , the eastern end of Claro M. Recto Avenue ( formerly Azcarraga ) , Legarda Avenue , Mendiola Street , and the different side streets . Each of the colleges and universities found here are at a short walking distance of each other . Another cluster of colleges lies along the southern bank of the Pasig River , mostly at the Intramuros and Ermita districts , and still a smaller cluster is found at the southernmost part of Malate near the border with Pasay such as the private co @-@ educational institution of De La Salle University , the largest of all De La Salle University System of schools . 
- The Division of the City Schools of Manila , a branch of the Department of Education , refers to the city 's three @-@ tier public education system . It governs the 71 public elementary schools and 32 public high schools . 
- The city also contains the Manila Science High School , the pilot science high school of the Philippines ; the National Museum , where the Spoliarium of Juan Luna is housed ; the Metropolitan Museum of Manila , a museum of modern and contemporary visual arts ; the Museo Pambata , the Children 's Museum , a place of hands @-@ on discovery and fun learning ; and , the National Library , the repository of the country 's printed and recorded cultural heritage and other literary and information resources . 
- 
- = = Global outreach = = 
- 
- 
- = = = Twin towns – Sister cities = = = 
- 
- Sister cities of Manila 
- 
- = = = = Asia / Pacific Rim = = = = 
- 
- 
- = = = = North America = = = = 
- 
- 
- 
- = Attalea ( palm ) = 
- 
- Attalea is a large genus of palms native to Mexico , the Caribbean , Central and South America . This pinnately leaved , non @-@ spiny genus includes both small palms lacking an aboveground stem and large trees . The genus has a complicated taxonomic history , and has often been split into four or five genera based on differences in the male flowers . Since the genera can only be distinguished on the basis of their male flowers , the existence of intermediate flower types and the existence of hybrids between different genera has been used as an argument for keeping them all in the same genus . This has been supported by a recent molecular phylogeny . 
- Somewhere between 29 and 67 species are recognised in the genus , with estimates of as many as 100 . Incomplete herbarium collections make it difficult to determine whether certain groups represent single species , or groups of similar species . Attalea species have a long history of human use , and include economically important sources of palm oil and fibre . Many species are fire tolerant and thrive in disturbed habitats . Their seeds are animal dispersed , including some which are thought to have been adapted for dispersal by now @-@ extinct Pleistocene megafauna . 
- 
- = = Description = = 
- 
- The genus Attalea has pinnately compound leaves — rows of leaflets emerge on either side of the axis of the leaf in a feather @-@ like or fern @-@ like pattern . Species are also non @-@ spiny palms and include both large trees with stout stems up to 30 metres ( 98 ft ) tall and acaulescent palms ( ones which lack an aboveground stem ) . The number of leaves per individual varies from about three to thirty @-@ five ; larger plants tend to have more and longer leaves . 
- Inflorescences are large , branched and borne among the leaves . The inflorescence consists of a main axis — the peduncle and the rachis — and a series of smaller branches , the rachillae . The rachillae , which bear the flowers , emerge from the rachis . The peduncle is the main stalk , connecting the rachis with the stem . Inflorescences either consist entirely of male flowers , or are predominantly female with a few male flowers . Fruit usually have two or three seeds , although fewer or more are present in some species , and are usually brown , yellow , orange @-@ brown or purple when mature . 
- Four different types of male flowers exist . On the basis of these flower types , the genus has often been split into four genera — a more narrowly defined Attalea , Orbignya , Maximiliana and Scheelea . The species sometimes referred to Orbignya have coiled anthers , while the other groups have straight ones . The petals of those placed in Maximiliana are much shorter than the stamens , while those placed in Scheelea and a more narrowly defined Attalea have petals that are longer than the stamens . Five species do not fit easily into any of these groups ; this fact has been used as an argument in favour of considering this group a single genus . 
- 
- = = Taxonomy = = 
- 
- Attalea has been placed in the subfamily Arecoideae , the tribe Cocoseae and the subtribe Attaleinae , together with the genera Allagoptera , Beccariophoenix , Butia , Cocos , Jubaea , Jubaeopsis , Lytocaryum , Parajubaea , Syagrus and Voanioala . Within this subtribe , Attalea has been found to be a monophyletic group , and sister to the clade containing Allagoptera , Polyandrococos , Parajubaea , Butia and Jubaea . 
- Disagreement exists as to whether Attalea should be considered a single genus , or a group of related genera . In their 1996 Field Guide to the Palms of the Americas , Andrew Henderson , Gloria Galeano and Rodrigo Bernal combined all the species in the subtribe Attaleinae ( as it was then defined ) into a single genus , Attalea . In his 1999 Taxonomic Treatment of Palm Subtribe Attaleinae , American botanist Sidney F. Glassman divided the group into five genera — a more narrowly defined Attalea , Orbignya , Maximiliana , Scheelea and Ynesa . Rafaël Govaerts and John Dransfield recognised a single genus in their 2005 World Checklist of Palms , and Jean @-@ Christophe Pintaud continued this usage in his 2008 review of the genus . 
- The multi @-@ genus approach is based solely on the structure of the male flowers ; no other characters could be consistently associated with one genus or another . Four of the genera — Attalea ( in a narrow sense ) , Orbignya , Maximiliana and Scheelea — correspond to four different types of male flowers found within the genus . However , a few species have flowers that are intermediate between these four types , including A. colenda ( which Glassman placed in its own genus , Ynesa ) and this has been used as an argument for the single @-@ genus approach . In addition , there are several hybrids between species that would be considered different genera under Glassman 's five @-@ genus system , which has also been used as an argument for placing them in a single genus . In 2009 Alan Meerow and colleagues published a molecular phylogeny of the subtribe which found that some species placed in Orbignya were actually more closely related to species placed in Scheelea than they were to other members of that genus ( if the five @-@ genus approach was used ) , while A. crassispatha , placed in Orbignya by Glassman , was actually a sister to both Scheelea and Orbignya . 
- 
- = = = History = = = 
- 
- The genus Attalea was first described by Carl Sigismund Kunth in 1816 based on specimens collected by Alexander von Humboldt and Aimé Bonpland , although older , pre @-@ Linnaean descriptions exist , including Charles Plumier 's 1703 description of A. crassispatha . The genus was named for Attalus III Philometor , king of Pergamon , known for his interest in medicinal plants . The type species is A. amygdalina , a Colombian endemic . The genera Maximiliana and Orbignya were described by Carl Friedrich Philipp von Martius in 1826 and 1837 respectively . Scheelea was described by Hermann Karsten in 1857 , and Ynesa by Orator F. Cook in 1942 . 
- 
- = = = Species = = = 
- 
- Experts disagree about the number of species in the genus Attalea ( broadly defined ) . In 1965 , Dutch taxonomist Jan Gerard Wessels Boer estimated that there may be as many as 100 species in the genus . In their 1996 Field Guide to the Palms of the Americas Andrew Henderson and coauthors recognised 29 species in the genus , while Sidney Glassman recognised 65 species in his 1999 treatment of the group . Largely following Glassman 's lead , Rafaël Govaerts and John Dransfield recognised 67 species in their 2005 World Checklist of Palms . An important element of this disagreement is the decision by Glassman to define species more narrowly than Henderson . As a result , what Henderson interpreted as variation within species , Glassman took as differences between morphologically similar species . This problem is complicated by the fact that many of these species are poorly represented in herbarium collections . The large size of the leaves , inflorescences and fruit of many Attalea species makes them difficult to collect . In addition , many important collections , including type specimen , have been lost or destroyed . Sparse or incomplete collections make it difficult to differentiate variation within a single species from variation between different species . 
- The three recent treatments ( Henderson and coauthors , Glassman , and Govaerts and Dransfield ) recognised a total of 73 species , but only 20 species are accepted by all of them . The remainder account for either nine species or more than 40 . For example , what Andrew Henderson considered a single species , Attalea attaleoides , other authors have considered a species complex consisting of four or five species . Glassman doubted the validity of A. attaleoides as a species , and described four new species from material that had previously been attributed to A. attaleoides — A. camopiensis , A. degranvillei , A. guianensis and A. maripensis . Govaerts and Dransfield accepted both Glassman 's four species and A. attaleoides . However , Jean @-@ Christophe Pintaud was of the opinion that A. guianensis , A. maripensis and A. attaleoides were all very similar , and thought it likely that they all represented the same species . 
- Another species complex in Attalea includes A. speciosa and related species . Henderson ( 1995 ) recognised A. speciosa and A. spectabilis , considering the latter to either be an acaulescent form of A. speciosa or a hybrid between it and A. microcarpa . Govaerts and Dransfield accepted A. spectabilis , but Glassman considered it a dubious taxon . Attalea vitrivir was recognised as a distinct species by Michael Balick and coauthors ; Glassman and Govaerts and Dransfield concurred , but Henderson considered it part of A. speciosa . Glassman also described a fourth member of this group , A. brejinhoensis , and it is accepted by Govaerts and Dransfield . 
- 
- = = Reproduction and growth = = 
- 
- Attalea species are monoecious — male and female flowers are separate , but are borne by the same plant . Various species have been described as being insect @-@ pollinated , including A. phalerata , while pollination in A. colenda and A. speciosa , has been attributed both to insects and wind . 
- Seed germination is remote tubular — during germination , as the cotyledon expands it pushes the young shoot away from the seed . After germination , the stem initially grows downward before turning to grow upward and produce the aboveground stem . This produces a " saxophone shaped " belowground portion of the stem . The fact that the shoot tips of Attalea seedlings are underground is likely to contribute to their fire @-@ tolerance . 
- 
- = = Distribution = = 
- 
- Species range across the Neotropics from Mexico in the north to Bolivia , Paraguay , and southern Brazil in the south . According to Govaerts and coauthors , three species are found in Mexico , four in Central America , and 62 in South America . Three species are present in the Caribbean — two in Trinidad and Tobago , along the southern edge of the region , and one in Haiti . 
- 
- = = Habitat and ecology = = 
- 
- Attalea includes both large trees and small acaulescent palms which occupy a number of different ecological niches . Dense stands of some of the larger species are conspicuous elements on the landscape , while smaller species are found both in the forest understorey and in savannas . 
- Disturbance has been implicated in the formation of vegetation dominated by large Attalea species . In seasonally dry Amazonian forests the density of large adult A. maripa palms was correlated with canopy openness ; the species also dominates savannas formed by repeated forest fires in Trinidad and Tobago . Attalea speciosa forms pure stands in many parts of Brazil where natural forest vegetation has been cleared . Similarly , stands of A. funifera in Bahia , Brazil ( which are cultivated for piassava fibre ) are managed using fire — the seedlings survive cutting and burning , and are able to dominate burned forest patches . 
- The fruit are dispersed by animals ; fruit which are not dispersed frequently suffer seed predation by bruchid beetles . Certain species of Attalea have been mentioned as examples of " anachronistic " species which are adapted for dispersal by now @-@ extinct Pleistocene megafauna . On Maracá Island , Roraima , in the Brazilian Amazon , Attalea maripa fruit were consumed by tapirs , collared peccaries , deer and primates . Rodents , including agoutis , fed upon the fruit and , as the fruit availability declined , they fed on the seeds . Other dispersers of Attalea fruit include Crested Caracaras which consume the fruit and disperse the seeds of A. phalerata in the Brazilian Pantanal . 
- 
- = = Uses = = 
- 
- Attalea species have a long history of human utilisation . Carbonised Attalea maripa seeds have been found in archaeological sites in Colombia dating back to 9000 BP . A variety of species remain important sources of edible oil , thatch , edible seeds and fibre . The leaves of Attalea butyracea and A. maripa are used extensively for thatching . Several species are oil palms , with A. speciosa among the most important economically . Products extracted from A. speciosa were reported to support over 300 @,@ 000 households in the Brazilian state of Maranhão in 2005 , and in 1985 it was estimated to support over 450 @,@ 000 households throughout Brazil . Piassava fibres , extracted from the leaf bases of A. funifera , are commercially important , and generated about US $ 20 million in annual income to Brazilian farmers in 1996 . 
- 
- 
- = The Heart of Ezra Greer = 
- 
- The Heart of Ezra Greer is a 1917 American silent drama film produced by the Thanhouser Company and directed by Emile Chautard . The film focuses on Ezra Greer , a successful middle @-@ aged man who searches for his college age daughter , Mary . The wayward Mary was romanced and abandoned by Jack Denbeigh , later bearing his child . Once Ezra becomes broke he finds employment as the valet for Jack Denbeigh . After Jack 's engagement to a cabaret girl , Mary becomes upset and leaves her child at Jack 's home . Contrary to Jack 's wishes , Ezra keeps the child and Jack ultimately reveals that the child is his own . Ezra convinces Jack to make things right and Ezra convinces the cabaret girl to leave Jack . After a carriage accident in which the baby is injured , Ezra and Jack rush to the hospital and find Mary as a nurse crying over the child . The film ends with the marriage of Jack and Mary . The film was released by Pathé on October 7 , 1917 . The film was the final release from Thanhouser and was deemed to be an average film by most reviewers . Criticism for the film hinged on far @-@ fetched coincidences to drive the plot . The film is presumed lost . 
- 
- = = Plot = = 
- 
- The film follows Ezra Greer , a middle @-@ aged man who has worked hard since his youth . He cares deeply for his motherless daughter , Mary , but was unable to attend the annual commencement at her co @-@ educational college . He waits for her to return from college , but Mary leaves with her romantic interest , Jack Denbeigh . On promise of marriage and wealth , Mary is romanced and gives birth to a fatherless child . Without word from his daughter , Ezra resigns from his job and attempts to seek her out and finds a poor motherless child , Marie . With Ezra 's money exhausted he seeks employment and finds it as the valet of Jack . 
- One day , Mary sees an announcement of Jack 's engagement to a cabaret girl known as " The Baby Vamp " . Bitter over the prospect of her child 's future , she leaves the child at Jack 's home during his absence with a note . Jack orders Ezra to take the baby to an orphanage , but Marie begs Ezra to keep him . After continually seeing the child , Jack is overcome with remorse and explains to Ezra and seeks his advice . Not knowing he was making the case for his own daughter , Ezra convinces Jack to seek out Mary and forget the Baby Vamp . The Baby Vamp seeks out Jack , but finds Ezra who convinces her to leave Jack . Jack 's son is later injured in a coach accident and is taken to the hospital . Jack and Ezra rush to the hospital and find Mary , as a nurse , crying over the injured child . Ezra is enraged upon learning that his own daughter was mistreated by Jack , but Mary steps between the two men . Jack apologizes and wants to make it right . The film concludes with the marriage of Jack and Mary . 
- 
- = = Cast = = 
- 
- Frederick Warde as Ezra Greer 
- Leila Frost as Mary 
- George Forth as Jack Denbeigh 
- Thomas A. Curran as Denbeigh 's guardian 
- Lillian Mueller as Amy Devers 
- Carey L. Hastings as Denbeigh 's housekeeper 
- Helen Badgley as the poor little girl 
- Gerald Badgley as the millionaire 's baby 
- W. Ray Johnston 
- 
- = = Production = = 
- 
- The film was the final production and release of the Thanhouser Company and it was to be released through Pathé . Numerous factors would play into the winding down and eventual closing of the Thanhouser Film Corporation with much advance notice by Edwin Thanhouser . Q. David Bowers writes that it was easy to understand Thanhouser 's decision to retire due to numerous aspects including that releases through Pathé were based on their decision to release or discard the work , the New Rochelle studio was 2 @,@ 500 miles from the center of the trade activity and the slump in industry tied to World War I. Weeks before the film was released , Variety told of the winding down of the Thanhouser with the studio 's staff consisting of Edwin Thanhouser and the bookkeeper , Jessie B. Bishop . The article concluded with the announcement that Lloyd F. Lonergan , the scenario writer of the company , had retired from the company . As it wound down , the Thanhouser Company was announced to have no liabilities and would close with a positive bank balance . Little is known of the production of this final film , but it was directed by Emile Chautard from a scenario written by Lloyd F. Lonergan . The cameraman was Jacques Bizeul . 
- 
- = = Release and reception = = 
- 
- The five reel film was released through the Pathé Exchange as a Pathé Gold Rooster Play on October 7 , 1917 . Charles E. Wagner of the Exhibitor 's Trade Review found it to be a good film with great direction and photography , but was concerned that the stunt in which the baby appeared to be involved in the accident was too real . Wagner stated the film had sufficient action and pathos without sexual suggestiveness ; which should prove a strong program for the Pathé program . Frances Agnew of The Morning Telegraph found it to be an average picture that was not exceptional for audiences , but it would hold sentimental appeal for the average viewer . A reviewer for The New York Dramatic Mirror found the film 's excessive use of coincidental meetings to be highly improbable , but found Warde 's performance to be excellent and the rest of the cast give good performances . The reviewer said that Emile Chautard had made the improbable story more plausible . 
- Like many American films of the time , The Heart of Ezra Greer was subject to cuts by city and state film censorship boards . The Chicago Board of Censors required the cutting in Reel 2 of a letter stating , " I cannot face my father , " etc . , and two closeups of gambling scenes ; and in Reel 5 a change of the intertitle " Because it means her whole future " to " Because she is his wife " . 
- 
- 
- = Free Derry = 
- 
- Free Derry ( Irish : Saor Dhoire ) was a self @-@ declared autonomous nationalist area of Derry , Northern Ireland , that existed between 1969 and 1972 . Its name was taken from a sign painted on a gable wall in the Bogside in January 1969 which read , " You are now entering Free Derry " . The area , which included the Bogside and Creggan neighbourhoods , was secured by community activists for the first time on 5 January 1969 following an incursion into the Bogside by members of the Royal Ulster Constabulary ( RUC ) . Residents built barricades and carried clubs and similar arms to prevent the RUC from entering . After six days the residents took down the barricades and RUC patrols resumed , but tensions remained high over the following months . 
- Violence reached a peak on 12 August 1969 , culminating in the Battle of the Bogside — a three @-@ day pitched battle between residents and the RUC . On 14 August units of the British Army were deployed at the edge of the Bogside and the RUC were withdrawn . The Derry Citizens Defence Association ( DCDA ) declared their intention to hold the area against both the RUC and the British Army until their demands were met . The British Army made no attempt to enter the area . The situation continued until October 1969 when , following publication of the Hunt Report , military police were allowed in . 
- The Irish Republican Army ( IRA ) began to re @-@ arm and recruit after August 1969 . In December 1969 it split into the Official IRA and the Provisional IRA . Both were supported by the people of the Free Derry area . Meanwhile , relations between the British Army and the nationalist community , which were initially good , deteriorated . In July 1971 there was a surge of recruitment into the IRA after two young men were shot and killed by British troops . The government introduced internment on 9 August 1971 , and in response , barricades went up once more in the Bogside and Creggan . This time , Free Derry was a no @-@ go area , defended by armed members of both the Official and Provisional IRA . From within the area they launched attacks on the British Army , and the Provisionals began a bombing campaign in the city centre . As before , unarmed ' auxiliaries ' manned the barricades , and crime was dealt with by a voluntary body known as the Free Derry Police . 
- Support for the IRA increased further after Bloody Sunday in January 1972 , when thirteen unarmed men and boys were shot dead by the British Army 's Parachute Regiment at a march in the Bogside ( a 14th man was wounded and died 4 ½ months later ) . The support began to wane after the killing by the Official IRA of a local youth who was home on leave from the British Army . After a Provisional IRA ceasefire , during which it entered talks with the British government , broke down , the British took the decision to move against the " no @-@ go " areas . Free Derry came to an end on 31 July 1972 , when thousands of British troops moved in with armoured cars and bulldozers to occupy the area . 
- 
- = = Background = = 
- 
- Derry City lies near the border between Northern Ireland and the Republic of Ireland . It has a majority nationalist population , and nationalists won a majority of seats in the 1920 local elections . Despite this , the Ulster Unionist Party controlled the local council , Londonderry Corporation , from 1923 onwards . The Unionists maintained their majority , firstly , by manipulating the constituency boundaries ( gerrymandering ) so that the South Ward , with a nationalist majority , returned eight councillors while the much smaller North Ward and Waterside Ward , with unionist majorities , returned twelve councillors between them ; secondly , by allowing only ratepayers to vote in local elections , rather than one man , one vote , so that a higher number of nationalists , who did not own homes , were disenfranchised ; and thirdly , by denying houses to nationalists outside the South Ward constituency . The result was that there were about 2 @,@ 000 nationalist families , and practically no unionists , on the housing waiting list , and that housing in the nationalist area was crowded and of a very poor condition . The South Ward comprised the Bogside , Brandywell , Creggan , Bishop Street and Foyle Road , and it was this area that would become Free Derry . 
- The Derry Housing Action Committee ( DHAC ) was formed in March 1968 by members of the Derry Branch of the Northern Ireland Labour Party and the James Connolly Republican Club , including Eamonn McCann and Eamon Melaugh . It disrupted a meeting of Londonderry Corporation in March 1968 and in May blocked traffic by placing a caravan that was home to a family of four in the middle of the Lecky Road in the Bogside and staging a sit @-@ down protest at the opening of the second deck of the Craigavon Bridge . After the meeting of Londonderry Corporation was again disrupted in August , Eamon Melaugh telephoned the Northern Ireland Civil Rights Association ( NICRA ) and invited them to hold a march in Derry . The date chosen was 5 October 1968 , an ad hoc committee was formed ( although in reality most of the organizing was done by McCann and Melaugh ) and the route was to take the marchers inside the city walls , where nationalists were traditionally not permitted to march . The Minister of Home Affairs , William Craig , made an order on 3 October prohibiting the march on the grounds that the Apprentice Boys of Derry were intending to hold a march on the same day . In the words of Martin Melaugh of CAIN " this particular tactic ... provided the excuse needed to ban the march . " When the marchers attempted to defy the ban on 5 October they were stopped by a Royal Ulster Constabulary ( RUC ) cordon . The police drew their batons and struck marchers , including Stormont MP Eddie McAteer and Westminster MP Gerry Fitt . Subsequently the police " broke ranks and used their batons indiscriminately on people in Duke Street " . Marchers trying to escape met another party of police and " these police also used their batons indiscriminately . " Water cannons were also used . 
The police action caused outrage in the nationalist area of Derry , and at a meeting four days later the Derry Citizens ' Action Committee ( DCAC ) was formed , with John Hume as chairman and Ivan Cooper as vice @-@ chairman . 
- 
- = = The first barricades = = 
- 
- Another group formed as a result of the events of 5 October was People 's Democracy , a group of students in Queen 's University Belfast . They organised a march from Belfast to Derry in support of civil rights , starting out with about forty young people on 1 January 1969 . The march met with violent opposition from loyalist counter @-@ demonstrators at several points along the route . Finally , at Burntollet Bridge , five miles outside Derry , they were attacked by a mob of about two hundred wielding clubs — some of them studded with nails — and stones . Half of the attackers were later identified from press photographs as members of the B @-@ Specials . The police , who were at the scene , chatted to the B @-@ Specials as they prepared their ambush , and then failed to protect the marchers , many of whom ran into the river and were pelted with stones from the bank . Dozens of marchers were taken to hospital . The remainder continued on to Derry where they were attacked once more on their way to Craigavon Bridge before they finally reached Guildhall Square , where they held a rally . Rioting broke out after the rally . Police drove rioters into the Bogside , but did not come after them . In the early hours of the following morning , 5 January , members of the RUC charged into St. Columb 's Wells and Lecky Road in the Bogside , breaking windows and beating residents . In his report on the disturbances , Lord Cameron remarked that " for such conduct among members of a disciplined and well @-@ led force there can be no acceptable justification or excuse " and added that " its effect in rousing passions and inspiring hostility towards the police was regrettably great . " 
- That afternoon over 1 @,@ 500 Bogside residents built barricades , armed themselves with steel bars , wooden clubs and hurleys , and told the police that they would not be allowed into the area . DCAC chairman John Hume told a meeting of residents that they were to defend the area and no @-@ one was to come in . Groups of men wearing armbands patrolled the streets in shifts . John ' Caker ' Casey , a local activist , painted " You are now entering Free Derry " in white paint on the gable wall of a house on the corner of Lecky Road and Fahan Street . That corner , which was a popular venue for meetings , later became known as " Free Derry Corner " . On 7 January , the barricaded area was extended to include the Creggan , another nationalist area on a hill overlooking the Bogside . A clandestine radio station calling itself " Radio Free Derry " began broadcasting to residents , playing rebel songs and encouraging resistance . On a small number of occasions law @-@ breakers attempted crimes , but were dealt with by the patrols . Despite all this , the Irish Times reported that " the infrastructure of revolutionary control in the area has not been developed beyond the maintenance of patrols . " Following some acts of destruction and of violence late in the week , members of the DCAC including Ivan Cooper addressed residents on Friday , 10 January and called on them to dismantle the barricades . The barricades were taken down the following morning . 
- 
- = = April 1969 = = 
- 
- Over the next three months there were violent clashes , with local youths throwing stones at police . Violence came to a head on Saturday , 19 April after a planned march from Burntollet Bridge to the city centre was banned . A protest in the city centre led to clashes with " Paisleyites " — unionists in sympathy with the anti @-@ civil rights stance of Ian Paisley . Police attempting to drive the protesters back into the Bogside were themselves driven back to their barracks . A series of pitched battles followed , and barricades were built , often under the supervision of Bernadette Devlin , newly elected MP for Mid Ulster . Police pursuing rioters broke into a house in William Street and severely beat the occupant , Samuel Devenny , his family and two friends . Devenny was brought to hospital " bleeding profusely from a number of head wounds . " At midnight four hundred RUC men in full riot gear and carrying riot shields occupied the Bogside . Convoys of police vehicles drove through the area with headlights blazing . 
- The following day , several thousand residents , led by the DCAC , withdrew to the Creggan and issued an ultimatum to the RUC — withdraw within two hours or be driven out . With fifteen minutes of the two hours remaining , the police marched out through the Butcher 's Gate , even as the residents were entering from the far side . The barricades were not maintained on this occasion , and routine patrols were not prevented . 
- Samuel Devenny suffered a heart attack four days after his beating . On 17 July he suffered a further heart attack and died . Thousands attended his funeral , and the mood was sufficiently angry that it was clear the annual Apprentice Boys ' parade , scheduled for 12 August , could not take place without causing serious disturbance . 
- 
- = = August – October 1969 = = 
- 
- The Apprentice Boys ' parade is an annual celebration by unionists of the relief of the Siege of Derry in 1689 , which began when thirteen young apprentice boys shut the city 's gates against the army of King James . At that time the parade was held on 12 August each year . Participants from across Northern Ireland and Britain marched along the city walls above the Bogside , and were often openly hostile to the residents . On 30 July 1969 the Derry Citizens Defence Association ( DCDA ) was formed to try to preserve peace during the period of the parade , and to defend the Bogside and Creggan in the event of an attack . The chairman was Seán Keenan , an Irish Republican Army ( IRA ) veteran ; the vice @-@ chairman was Paddy Doherty , a popular local man sometimes known as " Paddy Bogside " and the secretary was Johnnie White , another leading republican and leader of the James Connolly Republican Club . Street committees were formed under the overall command of the DCDA and barricades were built on the night of 11 August . The parade took place as planned on 12 August . As it passed through Waterloo Place , on the edge of the Bogside , hostilities began between supporters and opponents of the parade . Fighting between the two groups continued for two hours , then the police joined in . They charged up William Street against the Bogsiders , followed by the ' Paisleyites ' . They were met with a hail of stones and petrol bombs . The ensuing battle became known as the Battle of the Bogside . Late in the evening , having been driven back repeatedly , the police fired canisters of CS gas into the crowd . Youths on the roof of a high @-@ rise block of flats on Rossville Street threw petrol bombs down on the police . Walkie @-@ talkies were used to maintain contact between different areas of fighting and DCDA headquarters in Paddy Doherty 's house in Westland Street , and first aid stations were operating , staffed by doctors , nurses and volunteers . 
Women and girls made milk @-@ bottle crates of petrol bombs for supply to the youths in the front line and " Radio Free Derry " broadcast to the fighters and their families . On the third day of fighting , 14 August , the Northern Ireland Government mobilised the Ulster Special Constabulary ( B @-@ Specials ) , a force greatly feared by nationalists in Derry and elsewhere . Before they engaged , however , British troops were deployed at the scene , carrying automatic rifles and sub @-@ machine guns . The RUC and B @-@ Specials withdrew , and the troops took up positions outside the barricaded area . 
- A deputation that included Eamonn McCann met senior army officers and told them that the army would not be allowed in until certain demands were met , including the disarming of the RUC , the disbandment of the B @-@ Specials and the abolition of Stormont ( the Parliament and Government of Northern Ireland ) . The officers agreed that neither troops nor police would enter the Bogside and Creggan districts . A ' peace corps ' was formed to maintain law and order . When the British Home Secretary , Jim Callaghan , visited Northern Ireland and announced his intention to visit the Bogside on 28 August , he was told that he would not be allowed to bring either police or soldiers with him . Callaghan agreed . Accompanied by members of the Defence Committee , he was " swept along by a surging crowd of thousands " up Rossville Street and into Lecky Road , where he " took refuge " in a local house , and later addressed crowds from an upstairs window . In preparation for Callaghan 's visit the " Free Derry " wall was painted white and the " You are now entering Free Derry " sign was professionally re @-@ painted in black lettering . 
- Following Callaghan 's visit , some barricades were breached , but the majority remained while the people awaited concrete evidence of reform . Still the army made no move to enter the area . Law and order was maintained by a ' peace corps ' — volunteers organised by the DCDA to patrol the streets and man the barricades . There was very little crime . Punishment , in the words of Eamonn McCann , " as often as not consisted of a stern lecture from Seán Keenan on the need for solidarity within the area . " In September the barricades were replaced with a white line painted on the road . 
- The Hunt Report on the future of policing in Northern Ireland was presented to the Stormont cabinet in early October . Jim Callaghan held talks with the cabinet in Belfast on 10 October , following which the report 's recommendations were accepted and made public . They included the recommendation that the RUC should be ' ordinarily ' unarmed , and that the B @-@ Specials should be phased out and replaced by a new force . The new RUC Chief Constable , Arthur Young , an Englishman , was announced , and travelled to Belfast with Callaghan . The same day , Seán Keenan announced that the DCDA was to be dissolved . On 11 October Callaghan and Young visited Free Derry , and on 12 October the first military police entered the Bogside , on foot and unarmed . 
- 
- = = IRA resurgence = = 
- 
- The Irish Republican Army ( IRA ) had been inactive militarily since the end of the Border Campaign in 1962 . It was low in both personnel and equipment — Chief of Staff Cathal Goulding told Seán Keenan and Paddy Doherty in August 1969 that he " couldn 't defend the Bogside . I haven 't the men nor the guns to do it . " During the 1960s the leadership of the republican movement had moved to the left . Its focus was on class struggle and its aim was to unite the Irish nationalist and unionist working classes in order to overthrow capitalism , both British and Irish . Republican Clubs were formed in Northern Ireland , where Sinn Féin was proscribed . These clubs were involved in the formation of NICRA in 1967 . In Derry , the James Connolly Republican Club worked closely with Labour Party radicals , with whom they set up the Derry Housing Action Committee and Derry Unemployed Action Committee . The Derry Citizens ' Defence Association was formed initially by republicans , who then invited other nationalists to join . Although there were tensions between the younger leaders like Johnnie White and the older , traditional republicans such as Seán Keenan , both sides saw the unrest of 1968 @-@ 69 as a chance to advance republican aims , and the two shared the platform at the Easter Rising commemoration in April 1969 . 
- The events of August 1969 in Derry , and more particularly in Belfast where the IRA was unable to prevent loss of life or protect families burned out of their homes , brought to a head the divisions that had already appeared within the movement between the radicals and the traditionalists , and led to a split in December 1969 into the Official IRA and the Provisional IRA . Initially , both armies organised for defensive purposes only , although the Provisionals were planning towards an offensive campaign . In Derry there was far less hostility between the two organisations than elsewhere and householders commonly paid subscriptions to both . When rioters were arrested after the Officials ' Easter parade in March 1970 , Officials and Provisionals picketed their trial together . At the start the Officials attracted most of the younger members . Martin McGuinness , who in August 1969 had helped defend the barricades , initially joined the Officials , but a few months later left to join the Provisionals . 
- Relations between the British Army and the residents had steadily decayed since the first appearance of troops in August 1969 . In September , after clashes between nationalist and unionist crowds that led to the death of a Protestant man , William King , the British Army erected a ' peace ring ' to enclose the nationalist population in the area they had previously controlled . Roads into the city centre were closed at night and people were prevented from walking on certain streets . Although some moderate nationalists accepted this as necessary , there was anger among young people . Clashes between youths and troops became more frequent . The riot following the Officials ' Easter parade in March 1970 marked the first time that the army used ' snatch squads ' , who rushed into the Bogside wielding batons to make arrests . The snatch squads soon became a common feature of army arrest operations . There was also a belief that they were arresting people at random , sometimes days after the alleged offence , and based on the identification of people that they had seen from a considerable distance . The rioters were condemned as hooligans by moderates , who saw the riots as hampering attempts to resolve the situation . The Labour radicals and Official republicans , still working together , tried to turn the youth away from rioting and create socialist organizations — one such organization was named the Young Hooligans Association — but to no avail . The Provisionals , while disapproving of riots , viewed them as the inevitable consequence of British occupation . This philosophy was more attractive to rioters , and some of them joined the Provisional IRA . The deaths of two leading Provisionals in a premature explosion in June 1970 resulted in young militants becoming more prominent in the organization . Nevertheless , up to July 1971 the Provisional IRA remained numerically small . 
- Two men , Séamas Cusack and Desmond Beattie , were shot dead in separate incidents in the early morning and afternoon of 8 July 1971 . They were the first people to be killed by the British Army in Derry . In both cases the British Army claimed that the men were attacking them with guns or bombs , while eyewitnesses insisted that both were unarmed . The Social Democratic and Labour Party ( SDLP ) , the newly formed party of which John Hume and Ivan Cooper were leading members , withdrew from Stormont in protest , but among residents there was a perception that moderate policies had failed . The result was a surge of support for the IRA . The Provisionals held a meeting the following Sunday at which they called on people to " join the IRA " . Following the meeting , people queued up to join , and there was large @-@ scale rioting . The British Army post at Bligh 's Lane came under sustained attack , and troops there and around the city came under fire from the IRA . 
- 
- = = Internment and the third Free Derry = = 
- 
- The increasing violence in Derry and elsewhere led to increasing speculation that internment without trial would be introduced in Northern Ireland , and on 9 August 1971 hundreds of republicans and nationalists were arrested in dawn raids . In Derry , residents came out onto the streets to resist the arrests , and fewer people were taken there than elsewhere ; nevertheless leading figures including Seán Keenan and Johnnie White were interned . In response , barricades were erected once again and the third Free Derry came into existence . Unlike its predecessors , this Free Derry was marked by a strong IRA presence , both Official and Provisional . It was defended by armed paramilitaries — a no @-@ go area , one in which British security forces were unable to operate . 
- Gun attacks on the British Army increased . Six soldiers were wounded in the first day after internment , and shortly afterwards a soldier was killed — the first to be killed by either IRA in Derry . The army moved in in force on 18 August to dismantle the barricades . A gun battle ensued in which a young Provisional IRA officer , Eamonn Lafferty , was killed . A crowd staging a sit @-@ down protest was hosed down and the protesters , including John Hume and Ivan Cooper , arrested . With barricades re @-@ appearing as quickly as they were removed , the army eventually abandoned their attempt . 
- The Derry Provisionals had little contact with the IRA elsewhere . They had few weapons ( about twenty ) which they used mainly for sniping . At the same time , they launched their bombing campaign in Derry . Unlike in Belfast , they were careful to avoid killing or injuring innocent people . Eamonn McCann wrote that " the Derry Provos , under Martin McGuinness , had managed to bomb the city centre until it looked as if it had been hit from the air without causing any civilian casualties . " 
- Although both IRAs operated openly , neither was in control of Free Derry . The barricades were manned by unarmed ' auxiliaries ' . Crime was dealt with by a volunteer force called the Free Derry Police , which was headed by Tony O 'Doherty , a Derry footballer and Northern Ireland International . 
- 
- = = Bloody Sunday = = 
- 
- An anti @-@ internment protest organised by the Northern Ireland Civil Rights Association ( NICRA ) at Magilligan Camp in January 1972 was met with violence from the 1st Battalion , The Parachute Regiment ( 1 Para ) . NICRA had organised a march from the Creggan to Derry city centre , in defiance of a ban , on the following Sunday , 30 January 1972 . Both IRAs were asked , and agreed , to suspend operations on that day to ensure the march passed off peacefully . The British Army erected barricades around the Free Derry area to prevent marchers from reaching the city centre . On the day , march organisers turned the march away from the barriers and up to Free Derry Corner , but some youths proceeded to the barrier at William Street and stoned soldiers . Troops from 1 Para then moved into Free Derry and opened fire , killing thirteen people , all of whom were subsequently found to be unarmed . A fourteenth shooting victim died four months later in June 1972 . Like the killing of Cusack and Beattie the previous year , Bloody Sunday had the effect of hugely increasing recruitment to the IRA , even among people who previously would have been ' moderates ' . 
- 
- = = February - July 1972 = = 
- 
- Both the Provisional and Official IRA stepped up attacks after Bloody Sunday , with the tacit support of the residents . Local feelings changed , however , with the killing of Ranger William Best by the Official IRA . Best was a 19 @-@ year @-@ old local man who was home on leave from the British Army at his parents ' house in the Creggan . He was abducted , interrogated and shot . The following day 500 women marched to the Republican Club offices in protest . Nine days later , on 29 May , the Official IRA declared a ceasefire . The Provisional IRA initially stated that they would not follow suit , but after informal approaches to the British Government they announced a ceasefire from 26 June . Martin McGuinness was the Derry representative in a party of senior Provisionals who travelled to London for talks with William Whitelaw , the Secretary of State for Northern Ireland . The talks were not resumed after the ending of the truce following a violent confrontation in Belfast when troops prevented Catholic families from taking over houses in the Lenadoon estate . 
- Political pressure for the action against the " no @-@ go " areas increased after the events of Bloody Friday in Belfast . A British Army attack was considered inevitable , and the IRA took the decision not to resist it . On 31 July 1972 , Operation Motorman was launched when thousands of British troops , equipped with armoured cars and armoured bulldozers ( AVREs ) , dismantled the barricades and occupied the area . 
- 
- = = Subsequent history = = 
- 
- After Operation Motorman , the British Army controlled the Bogside and Creggan by stationing large numbers of troops within the area , by conducting large @-@ scale ' search ' operations that were in fact undertaken for purposes of intelligence gathering , and by setting up over a dozen covert observation posts . Over the following years IRA violence in the city was contained to the point where it was possible to believe ' the war was over ' in the area , although there were still frequent street riots . Nationalists — even those who did not support the IRA — remained bitterly opposed to the army and to the state . 
- Many of the residents ' original grievances were addressed with the passing of the Local Government ( Northern Ireland ) Act , 1972 , which redrew the electoral boundaries and introduced universal adult suffrage based on the single transferable vote . Elections were held in May 1973 . Nationalists gained a majority on the council for the first time since 1923 . Since then the area has been extensively redeveloped , with modern housing replacing the old houses and flats . The Free Derry era is commemorated by the Free Derry wall , the murals of the Bogside Artists and the Museum of Free Derry . 
- 
- 
- = Come What ( ever ) May = 
- 
- Come What ( ever ) May is the second studio album by American alternative metal band Stone Sour . It was recorded and produced by the band and Nick Raskulinecz at Studio 606 in Los Angeles , California , and was released on August 1 , 2006 , through Roadrunner Records . Writing for the album began as early as 2003 when vocalist Corey Taylor and guitarist James Root were writing material for their other band , Slipknot . In January 2006 Stone Sour began recording the follow @-@ up to their 2002 debut album Stone Sour , during which time drummer Joel Ekman left the band due to family constraints . He was eventually replaced by ex @-@ Soulfly drummer Roy Mayorga who played on all but two tracks on the album . 
- Following the release of the album , Stone Sour went on to promote it for over a year , releasing five singles and touring in several regions , including the United States , Canada , Japan and several countries in Europe . The album received generally positive reviews . It was praised for showing a progression in the band 's song writing ability and musical style . It was also certified Gold in the United States and Canada and the single " 30 / 30 @-@ 150 " was nominated for Best Metal Performance at the 49th Grammy Awards . On June 26 , 2007 Stone Sour released a special edition version of the album ; it included six previously unreleased tracks and a bonus DVD which featured three music videos and a complete live performance of the band in Moscow . It remains their best @-@ selling album to date , mostly due to the success of the single " Through Glass . " 
- 
- = = Production = = 
- 
- In September 2005 , lead singer Corey Taylor announced that Stone Sour would return with a second album . He said that they had written over 30 songs , some during the writing process of Vol . 3 : ( The Subliminal Verses ) , the third album by vocalist Taylor and guitarist James Root 's other band Slipknot , and that they were working on demoing the tracks before entering the studio . Dave Fortman was originally slated to produce the album ; however , on January 22 , 2006 Stone Sour began working on the album with producer Nick Raskulinecz at Dave Grohl 's personal studio ( Studio 606 ) , in Los Angeles . Time in the studio began with a week of pre @-@ production , during which guitarist Josh Rand says producer Raskulinecz " pushed [ the band ] to the brink and back " to help fine @-@ tune the songs they had previously written . Though Rand and Taylor wrote most of the music and lyrics for the first album , respectively , writing for Come What ( ever ) May was done by all members . 
- Following this , the band set out to record 18 tracks and work began on recording Joel Ekman 's drum tracks . However , Ekman was forced to leave the studio after four weeks due to his young son 's diagnosis of a brainstem glioma . With the fate of the album in jeopardy , Stone Sour recruited ex @-@ Soulfly member Roy Mayorga as a session drummer . Mayorga recorded drums for all but two tracks on the album ; Godsmack drummer Shannon Larkin performed on the track " 30 / 30 @-@ 150 " and guitarist Root performed drums on the bonus track " The Day I Let Go . " In an interview with Revolver during the recording process vocalist Taylor talked about the differences between this album and their previous album , Stone Sour . He said that pressures from fans and the record label were much larger , also noting that he " thrives on the pressure , because it gets [ him ] going . " While promising that " the album 's gonna be miles above the first one , " Taylor explained that it is " more melodic and darker " . In late March 2006 , drummer Joel Ekman officially left Stone Sour and the band was talking with a few drummers who could replace him . On April 7 , 2006 the recording sessions for Come What ( ever ) May concluded . A month later session drummer Roy Mayorga joined Stone Sour on a full @-@ time basis . 
- 
- = = Promotion = = 
- 
- It was announced in March 2006 that Stone Sour 's second album , which was tentatively titled " Come What May , " would be released on July 18 , 2006 . However , the release date for the album was pushed back until August 22 . Due to the delay Stone Sour released a music video for the track " Reborn " , which featured footage of the band working on the album in the studio . The cover artwork from the album was released online on May 20 , 2006 . Shortly after , it was confirmed by a representative from the band 's record label Roadrunner that the release date had been brought forward , and the official release date would be August 1 , 2006 . On July 31 , 2006 , the day before its release the album was made available online for streaming in its entirety through AOL . 
- On May 22 , 2006 the first single from the album , " 30 / 30 @-@ 150 " , was made available online as a free MP3 download . A music video for the single was shot with director P.R. Brown in Los Angeles ; the video received a premiere on MTV 's Headbangers Ball on June 3 , 2006 . Prior to the release of the second single from the album , " Through Glass " , radio stations throughout the US showed high support for the song . A music video for the single was shot with director Tony Petrossian and was released on June 9 , 2006 online through Yahoo ! . The third single from the album , " Sillyworld " , began receiving radio airplay in November 2006 . A music video for the single was shot in January 2007 and was released online on March 8 , 2007 . The fourth single from the album , " Made of Scars " , featured a music video which was recorded live on April 7 , 2007 and was posted online on June 5 , 2007 . The fifth and final single from the album , " Zzyzx Rd . " , started receiving radio airplay in Fall 2007 and no music video was made for the single . 
- The band began touring in support of the album prior to its release , initiating touring with several free shows in the US , followed by multiple appearances at festivals in Europe . They then joined Korn for their 2006 edition of Family Values Tour across the US , which featured 33 dates across 3 months . On August 8 , 2006 Stone Sour made a special guest appearance on The Tonight Show with Jay Leno to promote and perform their second single " Through Glass . " They also performed at the Japanese festival Summer Sonic midway through the Family Values Tour . Then through November and December 2006 , Stone Sour joined Disturbed for their Music as a Weapon Tour . In January 2007 Stone Sour joined Evanescence for a Canadian tour , followed by a headlining tour of Europe . They then headlined the Spring 2007 Jägermeister Music Tour across the US , followed by headlining tours in Australia and Japan . They then started a tour in Europe playing festivals and select headline shows . They wrapped up touring in support of the album with a headlining tour in the US through August and September in 2007 . 
- 
- = = = Special edition = = = 
- 
- On June 26 , 2007 , Stone Sour released a special edition version of the album with six previously unreleased tracks and a bonus DVD . The DVD featured a full concert performance by the band from October 2006 in Moscow and the music videos for " 30 / 30 @-@ 150 , " " Through Glass , " and " Sillyworld . " When talking about the special edition , vocalist Taylor said , " we really wanted to do something which was really cool , " saying that this shows the band 's different musical elements and them in their live element , which he says " people really gravitate towards . " In addition to this , Stone Sour released a live album of their concert in Moscow exclusively on iTunes , entitled Live in Moscow . 
- 
- = = Musical style = = 
- 
- In an interview with MTV in 2006 , vocalist Corey Taylor said that Come What ( ever ) May was a return to the roots of the band , stating it is " a lot more from the spirit of what the band started with in 1992 . " He noted how some songs were " very atmospheric , " while others maintained " the hard rock and the heavy stuff . " Jon Wiederhorn of MTV said that " for every thrash riff there 's a tunefully grungy passage , for every flailing guitar line there 's a rock @-@ radio hook . " When talking about the track " 30 / 30 @-@ 150 " , he said parts are " bludgeoning , barbed and heavy , " while others are " soaring and triumphant , " with the production of Raskulinecz helping balance the album 's heaviness with its radio @-@ accessibility . Come What ( ever ) May 's lyrics include themes of " pain , pleasure , happiness , and grief . " The diversity in subjects is evident throughout the album ; songs including " Come What ( ever ) May " were politically influenced while the track " Socio " is about " social anxiety attacks " that vocalist Taylor suffered . " Zzyzx Rd " is a love song written to Taylor 's wife for helping him in his struggles against alcoholism and contemplation of suicide . " I 've never written anything like that before , but it was very important for me to tell the world not only how much she saved me , but how much she means to me , " said Taylor . Taylor said there is a common thread with the lyrics throughout the album , saying that they are " about never forgetting where you came from , who you are and why you do this . " 
- 
- = = Reception = = 
- 
- Come What ( ever ) May was met with generally positive critical reviews . Several reviewers noted how it helped to further establish Stone Sour . Chad Bower of About.com stated that the band had " progressed a lot since their debut " , noting that the album was " very diverse and [ allows ] the band to show many different sides of their musical personality . " Megan Frye of Allmusic opens her review of the album by distinguishing what sets Stone Sour apart musically , stating " [ it 's their ] ability to create smooth , radio @-@ friendly alternative metal songs while simultaneously not boring the people who have heard way too much from post @-@ grunge groups . " On a similar note , Michael Melchor of 411mania said " the band is much better at the craft of songwriting than many of their peers . " However , in contrast , reviewer William Fry of IGN criticized the album , saying " Stone Sour doesn 't do anything inspired , original , or fresh here " even calling the album " completely misdirected , and stonewalled . " A particular point of interest for reviewers was how Come What ( ever ) May is more melodic than their previous album Stone Sour . Melchor of 411mania said the album is " much more liberal with the balladry and acoustic sounds than its predecessor , " and of the track " Sillyworld " he said " it sounds like what Nickelback could be if Chad Kroeger could write a good melody " . In his review , Chad Bower labeled Come What ( ever ) May as a " very melodic and accessible album " stating that " it has a little something for everyone . " Similarly , Megan Frye praised the album as an " unyielding effort from a promising talent " . 
- Come What ( ever ) May sold over 80 @,@ 000 copies in its first week and debuted at the fourth spot on the Billboard 200 in the United States , and went on to be certified gold in the UK , Canada and the United States . In 2007 , the single " 30 / 30 @-@ 150 " was nominated for Best Metal Performance at the 49th Grammy Awards . 
- 
- = = Track listing = = 
- 
- All lyrics written by Corey Taylor , all music composed by Stone Sour . 
- On the iTunes deluxe version , the pop version of " Zzyzx Rd . " replaced the original version as the 12th track . 
- 
- = = = Special edition DVD = = = 
- 
- 
- = = Personnel = = 
- 
- 
- = = Chart positions = = 
- 
- 
- 
- = Chad at the 2008 Summer Olympics = 
- 
- Chad sent a delegation of two athletes to compete at the 2008 Summer Olympics in Beijing , China : Moumi Sébergué , who competed in the men 's 100 meters , and Hinikissia Albertine Ndikert , who competed in the women 's 100 meters and also bore the Chadian flag during ceremonies . The appearance of this delegation marked the tenth appearance of Chad at the Summer Olympics , the first being at the 1964 Summer Olympics in Tokyo , Japan , and its seventh appearance since its Olympic hiatus between 1976 and 1980 . Both Sébergué and Ndikert ranked seventh in their respective heats and did not advance past the qualification round . As of the end of the 2012 London Olympics , there have been no medalists from Chad . 
- 
- = = Background = = 
- 
- Chad is a landlocked country in Africa whose northern region lies within the eastern reaches of the Sahara Desert and whose southern region lies within the eastern portion of the Sahel . It borders Libya to the north , Niger to the west , Sudan to the east , and the Central African Republic to the south . Chad was originally part of French Equatorial Africa until 1960 , when it declared independence . Some four years later , the former French colony made its début at the 1964 Summer Olympics in Tokyo , Japan . For the next three decades , Chad became embroiled in civil war and experienced invasions by Libya and upheavals by Sudanese @-@ backed rebels ; the civil war ended in 1990 , although rebel threats had persisted between then and 2008 . During Chad 's greatest era of instability , athletes from the country did not attend the 1976 Summer Olympics in Montréal , Canada or the 1980 Summer Olympics in Moscow , USSR , although delegations were sent to all other games between 1964 and 2008 . 
- The largest Chadian delegation to reach the Olympics appeared in the 1988 Summer Olympics in Seoul , South Korea and at the 1992 Summer Olympics in Barcelona , Spain ; each time , Chad 's National Olympic Committee sent six athletes . During the 1992 games , the NOC sent the nation 's first female Olympian . Since then ( and up to the Beijing games ) , at least one woman has been a part of the Chadian delegation . The smallest contingent of Chadian Olympians appeared at the 2004 Summer Olympics in Athens , Greece , when only Kaltouma Nadjina competed on the country 's behalf . The delegation that arrived in Beijing consisted of two athletes — one man ( 30 @-@ year @-@ old Moumi Sébergué ) and one woman ( 15 @-@ year @-@ old Hinikissia Albertine Ndikert ) , both participants in track events . Ndikert was Chad 's flagbearer at the ceremonies . Up to and including the Beijing games , there has yet to be a medalist from Chad . 
- 
- = = Athletics = = 
- 
- Competitors in athletics events could qualify for the next round of competition in two ways . Qualifying by right was posting a high result in their own heat , and qualifying by result was posting a high result in overall standings . Ranks shown are thus those within each heat , not in overall standings . 
- Moumi Sébergué represented Chad at the Beijing Olympics in the men 's 100 meters dash . Born in 1977 , Sébergué first participated in the Olympics at age 22 when he raced in the men 's 100 meters at the 2000 Summer Olympics in Sydney , Australia , placing seventh in his qualification heat and not progressing to later rounds . He did not attend the 2004 Summer Olympics in Athens , Greece , but returned to the Olympics at Beijing at the age of 30 . During the course of the August 14 , 2008 races in his event , when the qualification round took place , Sébergué competed in the tenth heat against seven other athletes . He finished the race in 11 @.@ 14 seconds , placing seventh in the heat ahead of Tuvalu 's Okinali Tinilau ( 11 @.@ 48 seconds ) and behind Gabon 's Wilfried Bingangoye ( 10 @.@ 87 seconds ) in a heat led by the Netherlands Antilles ' Churandy Martina ( 10 @.@ 35 seconds ) and Japan 's Naoki Tsukahara ( 10 @.@ 39 seconds ) . Of the 80 athletes who participated in the events , the Chadian sprinter ranked 70th . He did not advance to later rounds . 
- Hinikissia Albertine Ndikert competed on Chad 's behalf as the national delegation 's only female athlete at the Beijing games . She participated in the women 's 100 meters dash , and was 15 years old at the time of the competition . Ndikert had not previously competed in any Olympic games . During the qualification round of the event , which took place on August 15 , 2008 , Ndikert competed in the eighth heat against seven other athletes . She finished the race in 12 @.@ 55 seconds , placing seventh ; she defeated the Democratic Republic of the Congo 's Franka Magali ( 12 @.@ 57 seconds ) and fell behind Papua New Guinea 's Mae Koime ( 11 @.@ 68 seconds ) in a heat led by Nigeria 's Damola Osayomi ( 11 @.@ 13 seconds ) and the Bahamas ' Debbie Ferguson @-@ McKenzie ( 11 @.@ 17 seconds ) . Of the event 's 85 competitors , Ndikert finished in 64th place . Therefore , Ndikert did not advance to round two and beyond . 
- Key 
- Note – Ranks given for track events are within the athlete 's heat only 
- Q = Qualified for the next round 
- q = Qualified for the next round as a fastest loser or , in field events , by position without achieving the qualifying target 
- NR = National record 
- N / A = Round not applicable for the event 
- Bye = Athlete not required to compete in round 
- 
- 
- = View of the World from 9th Avenue = 
- 
- View of the World from 9th Avenue ( sometimes A Parochial New Yorker 's View of the World , A New Yorker 's View of the World or simply View of the World ) is a 1976 illustration by Saul Steinberg that served as the cover of the March 29 , 1976 , edition of The New Yorker . The work presents the view from Manhattan of the rest of the world showing Manhattan as the center of the world . 
- View of the World has been parodied by Ted Rall , Columbia Pictures , The New Yorker , The Economist and Mad Magazine , among others . The work has been imitated and printed without authorization in a variety of ways . The Columbia parody led to a ruling by the United States District Court for the Southern District of New York in Steinberg v. Columbia Pictures Industries , Inc. in favor of Steinberg because of copyright violations by Columbia Pictures . The work is regarded as one of the greatest magazine covers of recent generations and is studied by art students around the world . 
- 
- = = Background = = 
- 
- Saul Steinberg created 85 covers and 642 internal drawings and illustrations for The New Yorker , including its March 29 , 1976 cover , titled " View of the World from 9th Avenue " . This is regarded as his most famous work . It is considered an example of unintentional fame : Steinberg has noted that the type of fame that resulted from the work has diminished his significance to " the man who did that poster " . The work is sometimes referred to as A Parochial New Yorker 's View of the World or A New Yorker 's View of the World because it depicts a map of the world as seen by self @-@ absorbed New Yorkers . At one point The New Yorker applied for a copyright from the United States Copyright Office for the work . It assigned the copyright to Steinberg and subsequently reproduced posters of the painting . 
- 
- = = Detail = = 
- 
- The illustration is split in two parts , with the bottom half of the image showing Manhattan 's 9th Avenue , 10th Avenue , and the Hudson River ( appropriately labeled ) , and the top half depicting the rest of the world . It is a westward view over 10th Avenue . The rest of the United States is the size of the three New York City blocks and is drawn as a rectangle bounded by North American neighbors Canada and Mexico , with a thin brown strip along the Hudson representing " Jersey " , the names of five cities ( Los Angeles ; Washington , D.C. ; Las Vegas ; Kansas City ; and Chicago ) and three states ( Texas , Utah , and Nebraska ) scattered among a few rocks for the United States beyond New Jersey , which is in bolder font than the rest of the country beyond the Hudson . Washington , D.C. is depicted as a remote location near Mexico . The Pacific Ocean , slightly wider than the Hudson , separates the United States from three flattened land masses labeled China , Japan and Russia . Notably , the image depicts the world with a back turned to Europe , which is absent from the painting . 
- The work is composed in ink , pencil , colored pencil , and watercolor on paper and measures 28 by 19 inches ( 71 cm × 48 cm ) . When exhibiting this work along with alternate versions and sketches , the University of Pennsylvania summarized the work as a " bird 's @-@ eye view of the city from Ninth Avenue in a straight line westward , with space becoming ever more condensed ... " They also described the work as a tongue @-@ in @-@ cheek view of the world . New York interpreted the New York @-@ centric mind 's view of the rest of the world as a set of outer boroughs as iconic . National Post journalist Robert Fulford described the perspective as one in which the entire world is a suburb of Manhattan . 
- 
- = = Parodies = = 
- 
- View of the World has been imitated without authorization in a variety of ways . The work has been imitated in postcard format by numerous municipalities , states and nations . Steinberg had stated that he could have retired on royalties from the many parodies made of the painting , had they been paid , a motivation for his eventual copyright lawsuit for the Moscow on the Hudson use . Fulford , writing in The National Post , noted that the metaphor of the world as a suburb of Manhattan was " understood and borrowed " by the whole world . Local artists , especially poster artists , presented similarly compelling depictions of their own provincial perceptions . Fulford demonstrated the prominence of this work by mentioning that a high school in suburban Ottawa made imitating View of the World an assignment in its graphic arts class . He also noted that the result of this assignment was a worldwide variety of global foci from which the students viewed the world . 
- The illustration — humorously depicting New Yorkers ' self @-@ image of their place in the world , or perhaps outsiders ' view of New Yorkers ' self @-@ image — inspired many similar works , including the poster for the 1984 film Moscow on the Hudson ; that movie poster led to a lawsuit , Steinberg v. Columbia Pictures Industries , Inc . , 663 F. Supp . 706 ( S.D.N.Y. 1987 ) , which held that Columbia Pictures violated the copyright Steinberg held on his work . 
- On June 5 , 2003 , during the first term of George W. Bush 's presidency , Ted Rall presented A View of the World from Pennsylvania Avenue as a parody of View of the World from 9th Avenue in a Universal Press Syndicate editorial cartoon . He replaced the letters representing The New Yorker with The Bushie . 
- The cover was later satirized by Barry Blitt for the cover of The New Yorker on October 6 , 2008 . The cover featured Sarah Palin looking out of her window seeing only Alaska , with Russia in the far background . 
- The March 21 , 2009 The Economist included a story entitled " How China sees the World " that presents a parody that is also an homage to the original image , but depicting the viewpoint from Beijing 's Chang 'an Avenue instead of Manhattan . A caption above the illustration reads " Illustration by Jon Berkeley ( with apologies to steinberg and The New Yorker ) " . It accompanied an article that discussed the burgeoning Chinese economy at the time of the contemporary financial crisis . 
- The October 1 , 2012 cover of Mad Magazine satirized the problems with the September release of Apple Inc . ' s iOS 6 mobile operating system which included Apple Maps , a replacement for Google Maps . The work presents what View of the World might look like if one had relied upon the September 2012 version of Apple Maps to locate various landmarks . 
- 
- = = Critical review = = 
- 
- On October 17 , 2005 , the American Society of Magazine Editors unveiled its list of the top 40 magazine covers of the prior 40 years and ranked View of the World from 9th Avenue in fourth place . The listing stated that the work " ... has come to represent Manhattan 's telescoped perception of the country beyond the Hudson River . The cartoon showed the supposed limited mental geography of Manhattanites . " 
- 
- 
- = Bintulu = 
- 
- Bintulu / biːnˈtuːluː / ( Chinese : 民都魯 ; pinyin : Míndūlǔ ; Pe ̍ h @-@ ōe @-@ jī : Bîn @-@ to ͘ -ló ͘ ) is a coastal town on the island of Borneo in the central region of Sarawak , Malaysia . Bintulu is located 610 kilometres ( 380 mi ) northeast of Kuching , 216 kilometres ( 134 mi ) northeast of Sibu , and 200 kilometres ( 120 mi ) southwest of Miri . With a population of 114 @,@ 058 as of 2010 , Bintulu is the capital of the Bintulu District of the Bintulu Division of Sarawak , Malaysia . 
- The name of Bintulu was derived from the local native language " Mentu Ulau " ( picking heads ) . Bintulu was a small fishing village when Rajah James Brooke acquired it in 1861 . Brooke later built a fort there in 1862 . In 1867 , the first General Council meeting ( now Sarawak State Legislative Assembly ) was convened in Bintulu . It is the earliest state legislature system in Malaysia . The construction of the earliest airstrip in Bintulu began in 1934 but was halted in 1938 due to financial difficulties . During World War II , the airstrip was heavily bombed by Allied forces . The British later rebuilt the airstrip , and it became fully operational in 1955 . The old airport was replaced by a new airport in 2002 . Bintulu remained a fishing village until 1969 when oil and gas reserves were discovered off the coast . Since then , Bintulu has become the centre of energy intensive industries such as a Malaysia LNG plant , a Shell Middle Distillate Synthesis plant , and a Bintulu combined cycle power plant . The economy has also expanded into oil palm and forest plantations , palm oil processing , wood @-@ waste processing , and cement manufacturing . The port of Bintulu is the busiest in Sarawak . The town is also a gateway to Samalajau Industrial Park . 
- Among the tourist attractions in Bintulu are Similajau National Park , Tumbina Park , Tanjung Batu beach , Jepak village , Kuan Yin Tong temple , Assyakirin mosque , Council Negri monument , Tamu Bintulu , and Pasar Utama markets . The Borneo International Kite Festival is held annually in the town . 
- 
- = = Etymology = = 
- 
- During the 16th century , Bintulu was named " River de Burulu " by Portuguese cartographers . There are several legends surrounding the name Bintulu . During the Brooke dynasty , the indigenous Iban people practised headhunting to maintain their social status in the community . They threw the heads into the Kemena River , after which the heads had to be collected from the river . The practice of collecting the heads was known as " Mentu Ulau " ( picking heads ) in the local native language . Another story relates that two Iban warriors named Bernik and Jelab built houses along the river . They and their followers frequently carried out preservation of severed heads near a small river stream branching off from Sebezaw River because the river bank was flat and wide . Therefore , the small river stream was named " Mentu Ulau " river . Outsiders who came to Bintulu subsequently pronounced the name as " Mentulau " , and later the name evolved into " Bentulu " and , finally , " Bintulu " . 
- 
- = = History = = 
- 
- 
- = = = Brooke dynasty = = = 
- 
- James Brooke was appointed the White Rajah of Sarawak ( now known as Kuching ) by the Bruneian Empire in 1841 . In 1861 , the Sultanate of Brunei ceded the Bintulu region to Brooke . Bintulu was a small settlement at that time . A wooden fort named Fort Keppel was built in the village , named after Sir Henry Keppel , who was a close friend of the Rajah James and Charles Brooke . Sir Henry Keppel was responsible for crushing the Dayak piracy in the Saribas between 1840 and 1850 . Meanwhile , Charles Brooke was a nephew of James Brooke and would later become the latter 's successor as the second Rajah of Sarawak . Odoardo Beccari , an Italian botanist , visited Bintulu in 1867 . On 4 August , he started his journey on a gunboat named " Heartsease " , which was to send $ 6 @,@ 000 to Brunei for concessions being made to James Brooke in the Mukah and Bintulu regions . He went to Labuan before coming back to Bintulu . He dropped off at Kemena River on 13 August 1867 . His observations of the village were recorded as follows : 
- The fort of Bintulu which was built entirely of wood , was in somewhat ruinous condition . It stood nearly on the sea @-@ shore , and just behind it , at a distance of few paces , the primeval forests commenced ... Some chinamen had settled at the vicinity of the fort and had built a small bazaar ; but the village is chiefly formed by the houses of the Melanau beyond the Chinese kampong ( village ) . These Melanaus used to live further up the river , but since the construction of the fort , and the installation of an officer of the Rajah near the mouth of the river , they came to settle near the sea – a thing they would never have dared to do in former days for fear of the attacks of the Lanun pirates and Dayak pirates . 
- The houses of the Melanau people were built in rows on both sides of the Kemena River , mostly furnished by Nipah and Sago palms . Each house had its own shed projection into the entrance of the river , which was used for the processing of Sago palms . On 8 September 1867 , the first Sarawak General Council meeting ( now Sarawak State Legislative Assembly ) took place here . It was made up of 21 elected local community members ( five British officers and 16 Malay and Melanau local chiefs ) . The Council was formed by Raja Muda Charles Brooke under orders from Rajah James Brooke . The Council is the oldest state legislative assembly in Malaysia . 
- 
- = = = Japanese occupation = = = 
- 
- During World War II , Rajah Charles Vyner Brooke ordered the construction of airstrips in Kuching , Oya , Mukah , Bintulu , and Miri . Construction of the Bintulu airstrip was started in 1934 under the direction of C. W. Bailey , a Works and Building Inspector for the British Royal Air Force ( RAF ) . All the airstrips were completed except for the Bintulu airstrip , where construction was discontinued in October 1938 due to financial reasons . Japanese forces landed in Miri on 16 December 1941 . Sarawak fell into Japanese hands when they conquered Kuching on 24 December 1941 . When the Japanese invaded Sarawak , Charles Vyner Brooke had already left for Sydney ( Australia ) before the attack while his officers were captured by the Japanese and interned at the Batu Lintang camp . During the Japanese occupation , the Japanese used the airstrip for military purposes . However , the airstrip was heavily bombed by Allied forces . The British began reconstruction of the airstrip after the war ; during the project many unexploded bombs were unearthed . 
- On 5 September 1942 , Japanese Field Marshal Prince Maida ( 前田利为 ) boarded a plane from Kuching to Labuan to officiate an airport that bears his name . However , he never arrived . One month later , the plane was found to have crashed off the coast of Tanjung Datu , Bintulu . The cause of the plane crash was not known . The Japanese later set up a wooden pole memorial made up of Belian wood in Bintulu . The wooden pole was later taken back to Japan by the family of Prince Maida . 
- Chinese sawmill owners at Sibu and Bintulu were instructed by the Japanese to produce timber for repairs at oil fields and ship building . During the Japanese occupation , sawmills at Bintulu produced a total of 4 @,@ 000 tons of sawn timber . 
- 
- = = = Post @-@ war period = = = 
- 
- In the 1950s , major economic activities in Bintulu were the timber extraction industry , fishing , and Sago processing . In the 1960s , Bintulu was still a small fishing village , with a population of 5 @,@ 000 . No roads were constructed in Bintulu until 1969 when the first untarred road was built to connect Bintulu to Miri . The first bus that serviced the Miri – Bintulu route was owned by Majlis Amanah Rakyat ( MARA ) . The MARA bus line was an initiative by the Malaysian federal government to provide public transportation for the people . The Iban villagers paid the bus driver with " vegetables , chickens , bamboo shoots , and other items " . Before 1960 , Bintulu was connected to Kuching by sea through a ship named " Swee Joo " . After 1960 , the ship " Chin Chin " was added to the route . It took around 36 to 48 hours to reach Bintulu from Kuching , depending on the sea conditions . Due to lack of food supplies from Kuching , the villagers had to make do with limited food , and several villagers resorted to hunting in the jungles to supplement the food supply . 
- In 1960 there were only three primary schools in Bintulu . These schools provided classes until Primary 3 level . There were no secondary schools . Villagers could pursue their secondary school studies at either Miri or Kuching by using small boats as there were no roads connecting Bintulu to either Miri or Kuching . Bintulu Government Secondary School was opened in 1964 . In 1967 Bintulu celebrated the first 100 years of the Council Negri meeting ( Sarawak State Legislative Assembly ) . A stone monument was built in front of a government rice storeroom to commemorate the event . Bintulu was a sub @-@ district of Miri Division in the 1970s . The sub @-@ district was upgraded into a district in 1987 . 
- 
- = = = Discovery of oil and gas reserves = = = 
- 
- Large reserves of natural gas were discovered off the coast of Bintulu in 1969 . Following this , a feasibility study was done in 1975 , and Tanjung Kidurong was found to be a suitable site for a deep @-@ water port . On 14 June 1978 , Malaysia LNG Sdn Bhd ( MLNG Satu ) was established by Petronas , a Malaysian national oil and gas company for Liquefied Natural Gas ( LNG ) processing at Bintulu . On 8 July 1978 , the Bintulu Development Authority ( BDA ) was established by the Sarawak state government for infrastructure development and to promote industrial investment in the area . On 15 August 1981 , the Bintulu Port Authority was established at Tanjung Kidurong , starting operation on 1 January 1983 . Since the establishment of Sarawak Corridor of Renewable Energy ( SCORE ) in 2008 , Bintulu has become the gateway to Samalaju Industrial Park , which is located 62 kilometres ( 39 mi ) away from Bintulu . The industrial park is a centre of heavy , energy @-@ intensive industry . Among the companies that started their operations in the industrial park are Tokuyama Malaysia Sdn Bhd , Press Metal Bintulu Sdn Bhd , and OM Materials Sdn Bhd . 
- Rural – urban migration is significant in Bintulu because of greater job availability in the town . Since 2007 , new residents have started several squatter areas in Bintulu due to inability to find affordable housing , around Kidurong Industrial estate and Sungai Sebatang . To address the issue , several low @-@ cost housing projects were initiated by BDA and Sarawak state government to relocate the squatters . The state government planned to achieve zero squatters status by the year 2020 . Bintulu also saw the rise in the number of residential and commercial properties such as double @-@ storeyed terraced houses , terraced shopoffices , Kidurong Commercial Centre , and Time Square Shopping Mall . Residential properties have shown a 20 % price increase from 2011 to 2013 . 
- 
- = = Governance = = 
- 
- Bintulu is represented by Bintulu parliamentary seat ( P. 217 ) in the Parliament of Malaysia . The town is also represented by three state assembly seats – Jepak , Kidurong ( later was split by two state assembly namely Tanjung Batu and Samalaju ) , and Kemena – in the Sarawak State Legislative Assembly . 
- 
- = = = Local authorities = = = 
- 
- Since 1978 the town of Bintulu has been administered by the Bintulu Development Authority ( BDA ) , with offices located along Jalan Tanjung Kidurong . The town is located within the boundary of Bintulu District , with a population of 183 @,@ 402 and a total area of 7 @,@ 220 @.@ 40 square kilometres ( 2 @,@ 787 @.@ 81 sq mi ) . Bintulu Division was formerly a Bintulu District under the jurisdiction of Miri Division . The former Bintulu District was upgraded to Bintulu Division on 1 January 1987 . At the same time , Bintulu sub @-@ district was upgraded to the present @-@ day Bintulu District . Both the Bintulu Resident and District offices are located inside Wisma Residen , Pisang Keling Street , Bintulu . 
- 
- = = Geography = = 
- 
- Bintulu is located 610 kilometres ( 380 mi ) northeast of Kuching , 216 kilometres ( 134 mi ) northeast of Sibu , and 200 kilometres ( 120 mi ) southwest of Miri . Bintulu is located near the mouth of the Kemena River , in the coastal region of central Sarawak . Geology of the coastal area was formed during the Pleistocene period ; silt , clay , and gravel can be found here . Geological formation from the Oligocene period is found in the inland area , which contains limestone , siltstone , and sandstone . The soil is generally soft . 
- 
- = = = Climate = = = 
- 
- There are two monsoon seasons in Bintulu : the northeast monsoon ( November to March ) and the southwest monsoon ( May to September ) . The calm period between these two monsoons is known as the transitional period . In the coastal region , maximum rainfall occurs in the month of January , while minimal rainfall occurs from the period June to August . Rainfall is more evenly distributed in the inland areas . The annual rainfall of the Bintulu region is about 3 @,@ 750 mm ( 148 in ) . The mean daily hours of sunshine at Bintulu is about 5 @.@ 0 to 5 @.@ 5 hours . Bintulu receives on average 14 to 15 mJ / m2 of radiation throughout the year . Bintulu 's relative humidity is 85 % . 
- 
- = = Demographics = = 
- 
- The growth of Bintulu 's population is shown below : 
- The issue of gangsters in Bintulu was first raised in 2007 by the member of parliament ( MP ) for Bintulu . The gangsters may have run businesses related to illegal logging , controlling the prices of diesel , eggs , fertiliser and gas cylinders . Bintulu police have been cracking down on gangster activities in the town . Unscrupulous businessmen who seek cheap labour have caused a rise in the number of illegal immigrants in Bintulu . The number may have reached 50 @,@ 000 in 2009 . Bintulu immigration department has performed several operations to deport illegal immigrants back to their home country . 
- 
- = = = Ethnicity = = = 
- 
- As of the 2010 Malaysian census , the population of the town of Bintulu is 114 @,@ 058 . Indigenous people accounted for the largest proportion of the town population ( 61 @.@ 2 % , 69 @,@ 782 ) , followed by Chinese ( 25 @.@ 0 % , 28 @,@ 512 ) , Non @-@ Malaysians ( 13 @.@ 1 % , 14 @,@ 939 ) , and Indians ( 0 @.@ 28 % , 319 ) . Among the indigenous groups , there are Iban ( 32 @,@ 992 ) , Malay ( 14 @,@ 945 ) , Melanau ( 14 @,@ 179 ) , Bidayuh ( 1 @,@ 598 ) , and other indigenous tribes ( 6 @,@ 068 ) . According to government sources , there are 229 Iban longhouses in the Bintulu District . The Ibans moved into Kemena and Tatau basins in the mid @-@ 19th and early 20th century with permission of the Brooke government . Other indigenous tribes that form the minority are Kayan , Kenyah , and Punan . The Chinese in Bintulu are mainly composed of dialect groups such as Hakka , Fuzhou , and Teochews . The Chinese have been living in the town of Tatau since the era of Bruneian Empire . Later , Fuzhou Chinese from Sibu moved in , dominating the timber and plantation businesses in Bintulu . There is also a large number of foreigners working there . Most of them come from Britain , Australia , the Netherlands , Germany , South Africa , New Zealand , Japan , China , the United States , and Indonesia . 
- 
- = = = Languages = = = 
- 
- While Malay is the official language of Sarawak , English is widely spoken there . Local ethnic languages and Chinese dialects are spoken by the respective ethnic groups . Standard Chinese is also spoken by ethnic Chinese in Bintulu . Bintulu is spoken by communities living along the Kemena River , with 4 @,@ 200 native speakers . These speakers are now recognised as part of the Melanau ethnic group , where their main language is Malay . Bintulu is classified as one of the endangered languages in Sarawak because of the isolated usage of the language in a small community . 
- 
- = = = Religion = = = 
- 
- The majority of the Bintulu population are adherents of Christian denominations due to Christian missionaries operating during the Brooke dynasty ; followed by Islam , Buddhism , and Hinduism . Among the notable places of worship in Bintulu are the Bintulu Mosque ( Masjid Assyakirin ) , Masjid Jepak , Tua Pek Kong Temple , Eng Kwang Methodist Church , and St. Thomas Church . The respective religious groups are free to hold processions in the town . 
- 
- = = Economy = = 
- 
- There are five industrial estates in Bintulu . They are : Kemena Industrial Estate ( for wood @-@ based industries ) , Jepak Industries Estate ( wood @-@ based industries ) , Kidurong Industrial Area ( for medium and light industries ) , Kidurong Light Industrial Estate ( medium and light industries ) , and Bintulu Light Industrial Estate ( light industry ) . 
- 
- = = = Oil and gas = = = 
- 
- Malaysia LNG is a Liquefied Natural Gas ( LNG ) manufacturing complex located in Bintulu that currently contains eight LNG trains with a ninth one currently under construction . The complex was built by the Malaysian national oil and gas company , Petronas . The manufacturing complex has a production capacity of 25 @.@ 7 million tonnes per annum . Petronas is also planning to open Floating LNG ( FLNG ) offshore Bintulu , which is used specifically to harvest natural gas from small and isolated gas fields . Transportation of natural gas from the neighbouring state of Kimanis , Sabah , to the LNG complex at Bintulu is facilitated by a 512 @-@ kilometre ( 318 mi ) pipeline known as the " Sabah Sarawak Gas Pipeline " . Currently , 45 % of Malaysian natural gas is found at Central Luconia off the coast of Bintulu . The largest importers of Malaysia LNG productions are Japan ( 62 % ) , Korea ( 17 % ) , Taiwan ( 12 % ) , and China ( 9 % ) . 
- The Sarawak Shell Bintulu Plant ( SSBP ) , formerly known as Bintulu Crude Oil Terminal ( BCOT ) , was established in 1979 . It consists of three crude oil storage tanks , each with a capacity of 410 @,@ 000 barrels . It has three major areas of operation : Crude Oil Operations , Condensate Stabilisation , and Gas Sales Facilities . Royal Dutch Shell started to establish the world 's first Shell Middle Distillate Synthesis plant ( Shell MDS ) in 1993 . It is also known as Bintulu Gas @-@ To @-@ Liquid plant ( Bintulu GTL ) . The plant has a production capacity of 14 @,@ 770 barrels per day with a total investment of over US $ 1 billion as of the year 2010 . The plant is staffed with 380 people , of whom 93 % are Malaysians , with 80 % of the staff coming from Sarawak . 
- 
- = = = Wood @-@ based industries and plantations = = = 
- 
- Since the opening up of the Bintulu – Miri road in the 1970s , large @-@ scale plantations of oil palm and cocoa have been developed in rural areas of Bintulu Division . Currently , there are 57 @,@ 740 hectares ( 577 @.@ 4 km2 ( 223 sq mi ) ) of oil palm , 2 @,@ 000 hectares ( 200 km2 ( 77 sq mi ) ) of Rattan , and 815 hectares ( 8 @.@ 15 km2 ( 3 sq mi ) ) of pepper plantations . 
- The first Bintulu palm oil refinery , Bintulu Edible Oil Sdn Bhd , was established in June 1991 . Bintulu currently has four palm oil refineries : Bintulu Edible Oils Sdn Bhd ( operated under PGEO Group , a subsidiary of Wilmar International ) , Sime Darby Austral Edible Oil Sdn Bhd , Kirana Palm Oil Refinery Sdn Bhd , and Sarawak Oil Palm Bhd . However , as of 2015 , Wilmar no longer buys raw palm oil produced from cleared forests and peat swamps in Sarawak because of environmental concerns . 
- The Bintulu Division has been designated as a Planted Forests Zone ( PFZ ) by the Sarawak state government since 1998 . As of 30 June 2011 , a total of 124 @,@ 618 hectares ( 1 @,@ 246 @.@ 18 km2 ( 481 sq mi ) ) has been planted with acacia trees . Other trees that are planned for plantations are kelampayan , engkabang , durian , batai , eucalyptus , and rubber trees . Sarawak Planted Forest Sdn Bhd , a company wholly owned by the Sarawak state government , has been granted a license to replant forests for 60 years . However , the company has been suffering financial losses from 2009 to 2011 . 
- There are three mills in Bintulu that process wood @-@ waste products . Two are Medium @-@ density fibreboard ( MDF ) plants and the third is a charcoal briquette plant , with a total installed capacity of 246 @,@ 000 cubic metres ( 8 @,@ 700 @,@ 000 cu ft ) per year . MDF plants utilise wood waste purchased from sawmills and plywood mills in the Bintulu area and occasionally from the Tanjung Manis timber processing zone located at the mouth of the Rajang River . Synthetic resins , which are required to hold wood dust together , constituted 20 % of the total production cost of the wood panel products . MDF plants in Bintulu are operated by Daiken Sarawak Sdn Bhd , which was founded on 15 February 1994 . The briquette plant is operated by Cipta Briquette Sdn Bhd . A glue / adhesive factory in Bintulu is owned by Bintulu Adhesive & Chemicals Sdn Bhd . It produces urea formaldehyde resin and phenol formaldehyde resin for plywood and chipboard manufacturing at Kemena Industrial Estate . Urea precondensate is also produced to supply ASEAN Bintulu Fertiliser ( ABF ) plant . 
- 
- = = = Others = = = 
- 
- The Bintulu Port Authority was established in 1981 . It started port operation in 1983 at Tanjung Kidurong . Following a privatisation exercise , Bintulu Port Sdn Bhd ( BPSB ) was founded on 23 December 1992 and commenced operation on 1 January 1993 . BPA is currently responsible for regulatory exercises and security of the port . Meanwhile , BPSB is responsible for cargo handling at the Bintulu International Container Terminal ( BICT ) . The port also provides Vessel traffic service to shipping vessels . The annual total cargo throughput is 45 @.@ 4 million tonnes , consisting of 58 % LNG and 42 % non @-@ LNG products . As of 31 December 2014 , it generated a total revenue of RM 552 @.@ 3 million per year . Bintulu Port is the busiest port in Sarawak . 
- The ASEAN Bintulu Fertiliser plant is the anhydrous ammonia and granular plant operated by ASEAN Bintulu Fertiliser Sdn Bhd ( ABF ) , which is partly owned by Petronas . The company was formed on 6 December 1980 . The plant started operation on 1 October 1985 . It is also one of the largest granular urea plants in Asia . It is a joint venture by five ASEAN countries : Malaysia ( 63 @.@ 5 % shares ) , Thailand ( 13 % ) , Indonesia ( 13 % ) , the Philippines ( 9 @.@ 5 % ) , and Singapore ( 1 % ) . 
- Cahya Mata Sarawak Berhad ( CMSB ) , one of the largest publicly traded companies in Sarawak , set up a cement plant in Bintulu at Kidurong Industrial Estate . The plant , manned by 40 people , produces ordinary Portland cement and Portland blast furnace cement . It currently has a combined production capacity of 2 @.@ 75 million MT ( million tonnes ) . 
- The Bintulu combined cycle power plant was started in early 2010 with a capacity of 317 megawatts . The power plant is registered under the United Nations Clean Development Management ( CDM ) scheme as of 18 September 2010 . The plant is built to ensure efficient use of energy and reduce green house gas emissions . It is the first CDM power plant in Malaysia , currently operated by Sarawak Power Generation Sdn Bhd ( SPG ) , a wholly owned subsidiary of Sarawak Energy . 
- 
- = = Transportation = = 
- 
- 
- = = = Land = = = 
- 
- All roads in Bintulu are maintained by the Bintulu Development Authority ( BDA ) . Bintulu is connected to Miri and Sibu by the Pan Borneo Highway . Bintulu is also connected to Mukah and Samalaju Industrial Park . Kemena Bridge crosses the Kemena River . It is the second bridge in Malaysia built using the incremental launch method . Keppel Road in Bintulu is named after a friend of James Brooke , Sir Henry Keppel . 
- 
- = = = = Public transportation = = = = 
- 
- Bintulu has a long @-@ distance bus station , located at Medan Jaya , 5 km ( 3 mi ) northeast of the town centre . Among the areas served by the bus station are : Miri , Sibu , Kuching , Mukah , Sarikei , Oya , Dalat , Balingian , and Pontianak , Indonesia . The bus companies that serve the station are the Syarikat Baram Sdn . Bhd , MTC , Biaramas , and Suria bus lines . There are also buses that serve the town area . Taxi service is also available . 
- 
- = = = Air = = = 
- 
- The old Bintulu airport was built in 1955 in the town centre . It once held the Guinness World Record of nearest airport to town . On 19 December 2002 , the airport was replaced by a new airport , which is located 23 km ( 14 mi ) away from the town centre . The surroundings of the old airport were developed into commercial and residential projects while the runway is reserved for Bintulu International Kite Festival . The new airport has a runway measuring 2 @,@ 745 m ( 9 @,@ 006 ft ) , capable of handling planes as large as the Airbus A330 . The airport currently serves three major airlines : Malaysia Airlines ( MAS ) , Air Asia , and MASwings , connecting to domestic destinations such as : Sibu , Miri , Kuching , Kuala Lumpur , and Kota Kinabalu . 
- 
- = = = Water = = = 
- 
- There is a wharf terminal at Bintulu that serves the rural areas of Bintulu Division . Among the destinations that can be reached by express boat from Bintulu are : Sebauh , Pandan , Labang , Tubau , and Binyo . 
- 
- = = Other utilities = = 
- 
- 
- = = = Courts of law and legal enforcement = = = 
- 
- The current court complex is located at Pisang Emas Road . It comprises the High Court , the Sessions Court , and the Magistrate Court . Bintulu also has Syariah Subordinate Court , located at Tanjung Kidurong , whose area of jurisdiction covers Bintulu District and Tatau districts . The Bintulu central police station is located at Tun Hussein Onn Road , with other police stations located at Tanjung Kidurong , Tubau , and Sebauh . There is also a central prison in Bintulu , which doubles as a correctional centre . 
- 
- = = = Healthcare = = = 
- 
- Bintulu Hospital started operation in 1968 . It is located at Nyabau Road , 12 km ( 7 @.@ 5 mi ) from the town centre . Following renovations completed on 21 May 2000 , the hospital is now equipped with 200 beds . As of 2011 , the hospital provides speciality services in seven medical disciplines . Bintulu also has one polyclinic , Polyclinic Bintulu . There are two private hospitals in Bintulu : Columbia Asia Hospital and Bintulu Medical Centre . 
- 
- = = = Education = = = 
- 
- There are about 50 primary and eight secondary schools in Bintulu . All the schools under the National Education System are managed by the Bintulu District Education Office . The oldest primary schools in Miri are St Anthony 's Primary School ( Roman Catholic Mission School ) , Chung Hua Primary School , and the Orang Kaya Mohammad Primary School , which were established in the early 1960s . The Bintulu Government Secondary School was built in 1964 . It is now known as SMK Bintulu , the oldest secondary school in the town . Bintulu also has one Chinese independent school , Kai Dee Middle School ( 开智中学 ) . The Shell Oil Company established the Kidurong International School in 1982 to meet the primary education needs of Shell employees ' children . The school provides English National Curriculum ( ENC ) for literacy and numeracy and International Primary Curriculum ( IPC ) for other subjects . 
- UPM Bintulu Sarawak Campus was started as the National Resource Training Centre , Kuching , in 1974 . The oldest campus in Sarawak , it was relocated to Bintulu in 1987 as a branch campus of the Universiti Pertanian Malaysia ( UPM ) . The campus was closed down in 1992 before reopening in 2001 as Universiti Putra Malaysia ( UPM ) . During this period of closure , the campus was used as the site for Maktab Perguruan Sains Bintulu ( Bintulu Science Teachers ' Training College ) from 1994 to July 1999 when it was moved to Kota Samarahan as Institut Pendidikan Guru Kampus Tun Abdul Razak ( Tun Abdul Razak Teachers ' Training Institute Campus ) . The UPM campus is currently located 13 km ( 8 mi ) away from the town centre , occupying 715 ha ( 1 @,@ 767 acres ) , which can accommodate up to 2 @,@ 200 students . This branch campus currently has only one faculty , the Faculty of Agriculture and Food Sciences , consisting of five academic departments . In 2015 UPM was ranked 41st in the UI @-@ Greenmetric World University rankings . SEDAMAI College , established in November 1999 , offers courses in business , information technology , language , and engineering . 
- There is also a technical school located 15 km ( 9 mi ) away from the town , near Tanjung Kidurong , occupying 20 ha ( 49 acres ) of land . The school was built in 1982 with a maximum capacity of 900 students . Among the courses offered are : automotive , mechanical and civil engineering , commerce , and fashion . Gulf Golden International Flying Academy ( GGIFA ) , the first and only flying academy in Sarawak , was closed in 2012 due to financial difficulties . 
- 
- = = = Libraries = = = 
- 
- The first public library in Bintulu was built in 1971 by Bintulu District Council ( BDC ) . In 1988 the library was demolished to make way for car parks . Books from the library were moved into the former BDC building . On 29 May 2000 , the Bintulu Development Authority ( BDA ) public library was built near the Bintulu Civic Centre , which is 2 km ( 1 @.@ 2 mi ) from the town . The public library has three branches : at Kidurong , Tatau , and Sebauh . 
- 
- = = Culture and leisure = = 
- 
- 
- = = = Attractions and recreational spots = = = 
- 
- 
- = = = = Cultural = = = = 
- 
- Kampung Jepak ( Jepak village ) is a Melanau fishing village in Bintulu located near Kemena River . Among the daily activities in this village are Sago processing , fish drying , and the manufacturing of Belacan , Cencaluk ( salted shrimp ) , Terendak ( Melanau headgear ) , and Tutop ( a type of food cover ) . Kuan Yin Tong temple is located at KM2 Jalan Sultan Iskandar . It has a structural design with a rock garden courtyard , man @-@ made waterfall , and dragon fencing . Assyakirin mosque , meaning " Gratefulness to God " , has a man @-@ made waterfall , a fountain , and a landscape planted with flowers . The Borneo International Kite Festival has been held yearly since 2005 at the old Bintulu airport runway . It usually lasts for four to five days in September . 
- 
- = = = = Historical = = = = 
- 
- In 1987 a clock tower and a fountain were erected at Council Negri Monument . A centenary stone that was erected in 1967 to commemorate the event is kept under the clock tower . The Bintulu Tua Pek Kong temple ( near Tamu Bintulu ) is believed to have been built in the 1890s to purge the town from evil spirits . The temple survived World War II , and was rebuilt after the discovery of oil and gas reserves offshore . 
- 
- = = = = Leisure and conservation areas = = = = 
- 
- Similajau National Park is located 30 km ( 19 mi ) northeast of the town . The park was gazetted in 1976 , covering an area of 8 @,@ 996 ha ( 22 @,@ 230 acres ) ( 89 @.@ 96 km2 ( 35 sq mi ) ) with sandy beaches , rocky headlands , jungle streams , and forests . Other national parks that can be accessed along the Miri – Bintulu road are the Lambir Hills National Park and Niah National Park . 
- Tanjung Batu beach ( Temasya beach ) is located 3 km ( 1 @.@ 9 mi ) from the town centre . Meanwhile , Taman Tumbina ( Tumbina Park ) is located 4 km ( 2 @.@ 5 mi ) from the town centre . The park has a hornbill aviary , a butterfly garden , and a mini @-@ zoo . 
- 
- = = = = Other attractions = = = = 
- 
- Kidurong Tower is an observation tower located at Tanjung Kidurong . It offers a view of the Bintulu oil and gas facilities shortly after nightfall . Bintulu Promenade is a 3 @-@ kilometre ( 2 mi ) walkway along the Bintulu coastline with the Kemena River mouth as its focal point . It has observation points offering sunset views . There is also an 18 @-@ hole golf course at Bintulu . 
- 
- = = = = Shopping = = = = 
- 
- There are several shopping malls in Bintulu : ParkCity Mall , City Point , Ngiu Kee Departmental Stores , Farley shopping complex , Sing Kwong Supermarkets , and MDS @-@ Mart . Time Square Mall is currently under development in Bintulu , with Everrise as the anchor tenant . 
- Tamu Bintulu and Pasar Utama are the two main markets in the town . Both places have a unique cone @-@ shaped roof that symbolises the traditional Melanau headgear named Terendak . Tamu Bintulu offers items ranging from jungle produce to native home @-@ made specialties such as Belacan . Meanwhile , Pasar Utama houses both wet market and dry market under one roof , providing fresh vegetables , fruits , fish , and dairy products . The first floor of Pasar Utama offers a variety of fast food such as Laksa , Kolok Mee , Jawa Mee , Pulut Panggang , ais batu campur , cendol , and teh tarik . The Bintulu night market is located on Kampung Dagang road . There are over 150 stalls selling a variety of items such as garments , electric goods , vegetables , fruits , food and drinks . 
- 
- 
- = Zrinski Battalion = 
- 
- The Zrinski Battalion ( Croatian : Bojna Zrinski ) was a special forces unit of the Croatian National Guard ( Zbor narodne garde – ZNG ) and later of the Croatian Army ( Hrvatska vojska – HV ) established in Kumrovec on 18 May 1991 , during the Croatian War of Independence . The unit drew personnel from the special police forces , with former French Foreign Legion troops serving as its core . The battalion was set up and initially commanded by Ante Roso , while Major Miljenko Filipović took over as the commanding officer in August . 
- The Zrinski Battalion trained volunteer troops in Vukovar in June 1991 before it saw action in Hrvatska Kostajnica , the Battle of Gospić and near Slano in 1991 . By the end of 1991 , the unit 's personnel were tasked with setting up an additional special forces unit of the HV . The next year its elements took part in the Battle of Kupres and Operation Tiger aimed at lifting the Siege of Dubrovnik . It also helped develop and train the Croatian Defence Council ( Hrvatsko vijeće obrane – HVO ) , setting up a training camp in Tomislavgrad . In 1993 , the battalion took part in Operation Maslenica . In February 1994 , the Zrinski Battalion was amalgamated with several other HV special forces units into the 1st Croatian Guards Brigade ( 1 @.@  hrvatski gardijski zdrug ) , a component of the 1st Croatian Guards Corps ( 1 @.@  hrvatski gardijski zbor ) . 
- 
- = = Background = = 
- 
- In 1990 , following the electoral defeat of the government of the Socialist Republic of Croatia , ethnic tensions between Croats and Croatian Serbs worsened . The Yugoslav People 's Army ( Jugoslavenska narodna armija – JNA ) believed Croatia would use the Croatian Territorial Defence Force 's ( Teritorijalna obrana – TO ) equipment to build its own army and confront the JNA itself . In order to minimize the expected resistance , the JNA confiscated the TO weapons . On 17 August , the tensions escalated into an open revolt of the Croatian Serbs . 
- In the beginning of 1991 , Croatia had no regular army . In an effort to bolster its defence , Croatia doubled the size of its police force to about 20 @,@ 000 . The most effective part of the force was the 3 @,@ 000 @-@ strong special police that were deployed in 12 battalions , adopting military organisation . In addition there were 9 @,@ 000 – 10 @,@ 000 regionally organised reserve police . The reserve police were set up in 16 battalions and 10 companies , but they lacked weapons needed to arm many of the troops . 
- Preparations to set up the Croatian National Guard ( Zbor narodne garde – ZNG ) began on 12 April 1991 . Establishment of the ZNG as a police force with military capabilities was thought necessary by the Croatian authorities following armed clashes in Pakrac and at Plitvice Lakes in March and due to the possibility of further confrontation with the JNA . The ZNG , formally established on 23 April , was tasked with protection of the constitutional order , maintenance of public order , anti @-@ terrorist operations , protection of Croatia 's borders , territory , coast and territorial waters , as well as the protection of high @-@ value structures and high @-@ profile persons . 
- 
- = = Service = = 
- 
- On 18 May 1991 , the Zrinski Battalion was established as a special forces unit of the ZNG . The core of the unit consisted of 27 volunteers drawn from the Kumrovec Special Police Unit ( SPU ) . Initially , it relied on former French Foreign Legion troops . The most senior among the former legionnaires was Ante Roso , previously a Sous @-@ Officier ( non @-@ commissioned officer – NCO ) in the 4th Foreign Regiment . In consequence , Roso was tasked with setting up the unit as its initial commander . Major Miljenko Filipović , likewise a former French Foreign Legion member , was assigned as the battalion 's deputy commander . The unit was based in the village of Kumrovec in the region of Hrvatsko Zagorje , on the grounds of the former " Josip Broz Tito " political school . The site , adjacent to the border of Slovenia , was selected to be inaccessible to Yugoslav Air Force raids without violation of Slovene or possibly Austrian airspace . In June 1991 , the Kumrovec SPU was transferred to Sljeme Peak north of Zagreb leaving Kumrovec base to the Zrinski Battalion , as well as the second special forces unit , the Frankopan Battalion . 
- The Zrinski Battalion was deployed for the first time on 15 June . It was stationed in Vukovar , tasked with preparation of city defences and organisation of volunteer troops . In August , Filipović took over command of the battalion from Roso . The same month , the Zrinski Battalion was deployed to the Banovina , where it pushed the Croatian Serb forces out of the town of Hrvatska Kostajnica . In September , the battalion was deployed to Gospić , where it took part in battle to control Gospić against the JNA . Troops assigned to the battalion captured Kaniža barracks in Gospić . During combat in Gospić , 30 troops of the Zrinski Battalion , assisted by Lučko SPU , captured JNA Major General Trajče Krstevski , along with three armoured personnel carriers ( APCs ) and 32 soldiers . The unit was deployed to Metković on 28 October , tasked with recapturing Slano from the JNA . After the deployment to Gospić , a part of the unit personnel left to Bosnia and Herzegovina anticipating further conflict there , while the remainder of the unit returned to Kumrovec . The ZNG was renamed the Croatian Army ( Hrvatska vojska – HV ) on 3 November 1991 . In late 1991 , personnel of the Zrinski Battalion set up another special forces unit of the HV — the Matija Vlačić Battalion based in Opatija . 
- In 1992 , elements of the Zrinski Battalion took part in the Battle of Kupres , before setting up a training camp in the town of Tomislavgrad . There the battalion personnel assisted in setting up and trained the Croatian Defence Council ( Hrvatsko vijeće obrane – HVO ) . Later that year , elements of the battalion took part in Operation Tiger — aimed at lifting of the Siege of Dubrovnik . In 1993 , elements of the Zrinski Battalion took part in Operation Maslenica , fighting in the area of Škabrnja . The Central Intelligence Agency assessed the Zrinski Battalion as one of the best units of the HV . 
- 
- = = Amalgamation = = 
- 
- On 25 February 1994 , the Zrinski Battalion was amalgamated with parts of other special forces units of the HV : Frankopan Battalion , Ban Jelačić Battalion , Matija Vlačić Battalion , Ferdo Sučić Battalion and part of 8th Light Assault Brigade forming the 1st Croatian Guards Brigade ( 1 @.@  hrvatski gardijski zdrug ) , a component of the 1st Croatian Guards Corps ( 1 @.@  hrvatski gardijski zbor ) , directly subordinated to the Ministry of Defence rather than the General Staff of the Armed Forces of the Republic of Croatia . 
- 
- 
- = Weevils Wobble But They Don 't Go Down = 
- 
- " Weevils Wobble But They Don 't Go Down " is the nineteenth and penultimate episode of the third season of the American mystery television series Veronica Mars , and the 63rd episode overall . Written by Phil Klemmer and directed by Jason Bloom , the episode premiered on The CW on May 22 , 2007 . The series depicts the adventures of Veronica Mars ( Kristen Bell ) as she deals with life as a college student while moonlighting as a private detective . 
- In this episode , Weevil ( Francis Capra ) enlists Veronica 's help in proving his innocence when he is implicated in creating fake student IDs . Meanwhile , Veronica and Piz ( Chris Lowell ) come to terms with Veronica 's FBI internship , and a sex tape of this couple is released on the internet . Logan ( Jason Dohring ) beats up Piz , thinking that he posted it . In addition , Keith ( Enrico Colantoni ) and Vinnie ( Ken Marino ) debate on Piz 's radio show about the upcoming Sheriff 's election in which they are running against each other . 
- " Weevils Wobble But They Don 't Go Down " features the reappearance of Weevil after an absence of five episodes ; during filming of the third season , Capra was undergoing medical treatment . Series creator Rob Thomas pointed out Logan and Piz 's fight scene as one of the highlights of the episode and the season . In its original broadcast , the episode received mostly positive reviews from television critics , with many praising the case @-@ of @-@ the @-@ week . 
- 
- = = Synopsis = = 
- 
- In a checkout line , Veronica and Mac ( Tina Majorino ) discuss her relationship with Piz and what her FBI internship will mean for them . They then see someone getting arrested by the campus police for a fake debit card . Weevil injures himself , but finds himself unable to receive adequate benefits . Veronica gets Piz to accept her leaving for the internship . At the Sheriff ’ s station , Weevil is called out by several students as being the one who gave them the fake cards . Weevil is put in the jail cell , but he tells Veronica that he thinks they targeted him after he was showcased in the criminology class . She investigates several of the owners of the student IDs , who do seem genuine in accusing Weevil . Weevil gets bail , but Keith informs him that a student ID printing machine was found in the locker next to Weevil ’ s . Veronica and Wallace speak to a mechanical engineering professor , who says that it would be impossible for Weevil to be the culprit . Piz invites Keith to appear on his show in an election special before Keith informs Veronica that Weevil ’ s fingerprints were found all over the investigation . 
- Keith and Vinnie debate on Piz ’ s radio show , and Vinnie gathers some support about his lax underage drinking policy . Wallace notices someone following him , and the student wants to recruit him for a secret society . Vinnie insults Keith ’ s handling of his home life , and Veronica hits him on the head lightly . She helps Weevil retrace his steps and notices one of her criminology classmates , Jenny ( Dianna Agron ) , involved with one of Weevil ’ s clients . Veronica deduces that Jenny and her circle of friends are responsible for creating the fake student IDs , disseminating them , and implicating Weevil . Logan and Dick ( Ryan Hansen ) go surfing , and they run into Veronica , Piz , and Mac helping Wallace ( Percy Daggs III ) with his final project for a class . Veronica learns that one of Jenny ’ s group lives in the same town in Georgia where the student ID machines are made . 
- Veronica is called into a room in the library by Jenny ’ s group , who try to bribe her into not turning them in to the Hearst police . However , she denies and records their conversation as further proof . They are hiding a third ID machine , and Veronica tells Weevil to go and find it . Dick shows Logan a sex tape of Piz and Veronica that has been circulating in an email . Because Logan thinks Piz posted the tape , he attacks Piz and hits him incessantly . Weevil does have a student ID machine , and Logan walks into Mars Investigations with Piz ’ s blood on him . 
- 
- = = Production = = 
- 
- " Weevils Wobble But They Don 't Go Down " was written by Phil Klemmer and directed by Jason Bloom , marking Klemmer 's fifteenth and final writing credit and Bloom 's fourth and final directing credit for Veronica Mars , after " Green @-@ Eyed Monster " , " Nevermind the Buttocks " , and " Charlie Don 't Surf " . The episode features the final appearance of Dianna Agron , famous for her role as Quinn Fabray on Glee , as Jenny Budosh , Veronica ’ s classmate . Agron had previously appeared in the episode “ President Evil ” . The episode prominently features Weevil ( Francis Capra ) , who appears after a five episode hiatus . During the third season , Capra was struggling with a medical condition , and the medication he took for this illness caused his face and neck to swell and break out with acne . 
- Series creator Rob Thomas included the scene in which Logan attacks Piz on his list of highlights from the third season . When Thomas first viewed the scene with his wife , she emitted an audible groan when she saw Logan walking through the cafeteria towards Piz , anticipating the fight that was about to come . Thomas thought that most fight scenes on the show were not well @-@ done , as they are filmed quickly and without a second unit . However , he was pleased with the final cut , stating that there was only one punch that he thought looked fake . 
- 
- = = Reception = = 
- 
- 
- = = = Ratings = = = 
- 
- In its original broadcast , “ Weevils Wobble But They Don ’ t Go Down ” received 1 @.@ 78 million viewers , ranking 77th of 85 in the weekly rankings . This was a decrease from the previous episode , “ I Know What You 'll Do Next Summer ” , which garnered 2 @.@ 10 million watchers . 
- 
- = = = Reviews = = = 
- 
- Eric Goldman , writing for IGN , graded the episode a 9 @.@ 0 out of 10 , indicating that it was “ amazing ” . His very positive review focused on the ambiguous nature of the main plot and the subplots of Dick and Logan . He praised the presence of the majority of the main cast members , also stating that Dick ’ s reaction to dealing with his brother was realistic for him and in character . The reviewer called the conclusion to the case @-@ of @-@ the @-@ week a “ fun and satisfying one ” that highlighted the moral ambiguity of main characters , something “ that the show had been lacking for a while . ” He highlighted this ambiguity in both Weevil and Logan , elaborating that Logan attacking Piz showed an interesting side to his personality that had not been present all season . The reviewer also lauded the cliffhanger ending , stating , “ it was a reminder of how exciting and intense this show can be at its best . ” Television Without Pity did not grant the episode a rating but lauded the characterization of Weevil , stating , “ This is the most consistently written character on the show , bar none . ” 
- Kelly West of Cinema Blend focused primarily on the series finale in her review but referred to this episode as containing a solid case @-@ of @-@ the @-@ week . “ Overall it was a good mystery @-@ of @-@ the @-@ week but seeing as it was the second to last episode , the only thing I cared about was that we finally got more Weevil ! ” Rowan Kaiser of The A.V. Club gave the episode a mixed review , stating that the case @-@ of @-@ the @-@ week was an oversimplification of the overall themes of Veronica Mars . The reviewer enjoyed the potential for Logan and Dick ’ s character development , although he felt that it was blunted by the knowledge that the series was about to end . “ As fun as this episode is , the lack of the fourth season stops me from fully enjoying it . [ … ] But the knowledge that this might be the last time we see them puts a damper on things . Still , better to go out with good episodes than bad . ” 
- 
- 
- = Temple Beth Israel ( Eugene , Oregon ) = 
- 
- Temple Beth Israel ( Hebrew : בית ישראל ) is a Reconstructionist synagogue located at 1175 East 29th Avenue in Eugene , Oregon . Founded in the early 1930s as a Conservative congregation , Beth Israel was for many decades the only synagogue in Eugene . 
- The congregation initially worshipped in a converted house on West Eighth Street . It constructed its first building on Portland Street in 1952 , and occupied its current LEED @-@ compliant facilities in 2008 . 
- In the early 1990s conflict between feminist and traditional members led to the latter leaving Beth Israel , and forming the Orthodox Congregation Ahavas Torah . Beth Israel came under attack from neo @-@ Nazi members of the Volksfront twice , in 1994 and again in 2002 . In both cases the perpetrators were caught and convicted . 
- Services were lay @-@ led for decades . Marcus Simmons was hired as the congregation 's first rabbi in 1959 , but left in 1961 . After a gap of two years , Louis Neimand became rabbi in 1963 , and served until his death in 1976 . He was followed by Myron Kinberg , who served from 1977 to 1994 , and Kinberg in turn was succeeded by Yitzhak Husbands @-@ Hankin . Maurice Harris joined Husbands @-@ Hankin as associate rabbi in 2003 , and served until 2011 , when he was succeeded by Boris Dolin . As of 2014 , led by Husbands @-@ Hankin and Dolin , Beth Israel had approximately 400 member households , and was the largest synagogue in Eugene . 
- 
- = = Early history = = 
- 
- Small numbers of German Jews began settling in Eugene in the late 19th century , but most moved on . In the early 20th century the first Eastern European Jews settled there , and by the 1920s Eugene 's Jewish community began gathering prayer quorums for holding Friday night and Jewish holiday services in individuals ' homes . Historian Steven Lowenstein writes that " [ a ] fter Hymen Rubenstein 's death in 1933 , his home at 231 West Eighth Street was remodeled and named Temple Beth Israel " . It was a traditional Conservative synagogue , and from that time until the 1990s it was the only synagogue in Eugene . 
- In 1952 , the congregation constructed a one @-@ story synagogue building on an almost 1 acre ( 0 @.@ 40 ha ) property at 2550 Portland Street . Designed by architect and Holocaust @-@ survivor Heinrich Hormuth ( H.H. ) Waechter , the building featured an interior courtyard that provided natural lighting , and " a network of ceiling beams painted with symbols and shapes " by Waechter . 
- Temple Beth Israel 's services and religious functions were lay @-@ led for decades . Its first rabbi was Marcus Simmons . Originally from England , he was a graduate of University of London and Oxford University , and was ordained at the Hebrew Theological Seminary . He emigrated to the United States in 1957 , and joined Beth Israel in 1959 . The members were not , however , agreed that a full @-@ time rabbi was required , and in 1961 , he accepted a rabbinical position in Downey , California . 
- Following a hiatus of two years , Louis Neimand was hired as rabbi in 1963 . Born in New York City in 1912 to immigrant parents , he was a graduate of City University of New York and was ordained at the Jewish Institute of New York . He had previously worked for the United Jewish Appeal , and from 1959 to 1963 was the first Hillel rabbi at Syracuse University . There was some concern about Neimand 's hiring , as he had a police record as a result of his involvement in freedom marches in the African @-@ American Civil Rights Movement ( 1955 – 1968 ) . He served until his death in 1976 . 
- 
- = = Kinberg era = = 
- 
- Myron Kinberg was hired as rabbi in 1977 . Ordained in Reform Judaism , he had previously served as a rabbi in Topeka , Kansas for two years , then lived in Israel for two years , before coming to Eugene . Kinberg was known for his support for minority rights and gay rights , anti @-@ nuclear and anti @-@ war activism , support of reconciliation between Israel and the Palestinians , and outreach to non @-@ observant members of Eugene 's Jewish community . 
- Kinberg attempted to revive the Biblical concept of the " ger toshav " in his approach to intermarriage . He was willing to officiate at an intermarriage if the non @-@ Jewish partner , after discussions with the rabbi , agreed of his or her own free will to fulfill a set of commitments , including " a commitment to a Jewish home life , participation in Jewish life and tradition , and raising future children as Jews " . The non @-@ Jewish partner making this commitment became a " ger toshav " , or " non @-@ Jewish member of the Jewish people " . 
- Kinberg 's wife Alice was a strong feminist , and during the 1980s he and his wife supported a number of changes to the liturgy and ritual . These included allowing women to read from the Torah and lead the prayers , and changing prayers to be more gender inclusive - for example , using gender @-@ neutral terms and pronouns for God , and adding references to the Biblical matriarchs in prayers like the Amidah , which traditionally only mentioned the Biblical patriarchs . While most congregation members approved of these changes , a minority resisted them . 
- 
- = = = Schism = = = 
- 
- By the early 1990s serious divisions developed among the members of the congregation over a number of issues , including personal antagonisms , the rabbi 's activism and " advocacy of ' ultra @-@ liberal ' causes " , political differences over the Israeli – Palestinian conflict , and 
- a myriad of additional Jewish cultural / religious issues , such as the acceptance of intermarried couples , adherence to kosher dietary laws , the use of modern language and music during worship services , rewriting of certain prayers such as the Aleynu to make them less ethnocentric , and so on . 
- However , the biggest source of division , which underlay all others , was " the roles and rights of men and women in the synagogue . " 
- In the early 1990s a group of newly observant members began holding more traditional services in a back room of the synagogue , complete with a mechitza , a partition separating men and women . The " more feminist @-@ minded " members strongly objected to having a mechitza anywhere in the Temple Beth Israel building , even if it were not in the services they attended . The latter group eventually circulated a petition which stated that either the mechitza would have to be taken down , or those members who wanted it would have to leave . Kinberg also signed the petition . Faced with this opposition , in 1992 the Orthodox members left , renting new premises and hiring their own rabbi , creating Eugene 's second synagogue , originally called " The Halachic Minyan " , and in 1998 renamed " Congregation Ahavas Torah " . 
- Kinberg held himself responsible , and the schism led to his " reassessment of the needs of Temple Beth Israel and his role as a rabbi " . As a result , he left Beth Israel in 1994 to lead a synagogue on Long Island . During his tenure at Beth Israel , membership rose from 118 to 350 families . Kinberg died two years later at age 51 . 
- 
- = = Husbands @-@ Hankin era = = 
- 
- Yitzhak Husbands @-@ Hankin succeeded Kinberg in 1995 . Husbands @-@ Hankin began his involvement at Temple Beth Israel first as a congregant , then as cantor , and then as an assistant rabbi . He was active in forming the Jewish Renewal movement , and was ordained by its leader Zalman Schachter @-@ Shalomi . 
- The congregation decided to leave the Conservative movement in 1995 , and for a year had no affiliation . In late 1996 , after considering both Reform and Reconstructionist as alternatives , the congregation affiliated with the Reconstructionist movement . By 1999 , membership had grown to around 370 families . 
- Husbands @-@ Hankin was instrumental in developing the concept of " Ethical Kashrut " , the idea that one should only purchase goods that are produced in an ethical way . His essay , " Ethical Kashrut , " was selected for publication in Arthur Kurzweil 's Best Jewish Writing 2003 . A singer , cello and guitar player , he composes and performs Jewish music . 
- Husbands @-@ Hankin has had four assistant or associate rabbis working with him . Shoshana Spergel joined Temple Beth Israel in 1998 as interim rabbi when Husbands @-@ Hankin went on a sabbatical ; Jonathan Seidel was assistant rabbi from 2001 to 2003 . Maurice Harris , a 2003 graduate of the Reconstructionist Rabbinical College , joined as assistant rabbi in 2003 . He is one of the signators of The Open Letter Concerning Religion and Science From American Rabbis , part of the Clergy Letter Project which " encourages and embraces the teaching of evolution in schools " . In 2011 , Boris Dolin joined the congregation as its newest associate rabbi . 
- 
- = = = Attacks by neo @-@ Nazis = = = 
- 
- On March 20 , 1994 , Chris Lord , an individual associated with the Volksfront and American Front , fired ten rounds with an assault rifle into the temple , damaging the interior . The attack was prompted by a newspaper article about several members of Eugene 's Jewish community , including a lesbian . Community organizations , including a local gay rights group , responded by standing vigil outside the synagogue during Passover services . Lord and an associate were caught and convicted , and Lord was sentenced to four and a half years in prison . 
- On October 25 , 2002 Jacob Laskey , his brother Gabriel Laskey , Gerald Poundstone , Jesse Baker , and one other man , all members of the Volksfront , drove to Beth Israel with the intent of intimidating the congregants . While a service with 80 members attending was taking place , the men threw rocks etched with Nazi swastikas through the synagogue 's stained glass windows , then sped off . The men were caught , pleaded guilty , and were convicted . They served sentences ranging from a 6 @-@ month work release term and five years probation , to eleven years and three months in federal prison for the ringleader , Jacob Laskey . 
- 
- = = = East 29th Avenue building = = = 
- 
- Originally sized for 75 families , Temple Beth Israel 's Portland Street building had been renovated and enlarged over the years to 7 @,@ 500 square feet ( 700 m2 ) to accommodate 250 families and 150 students . Despite these additions and the loss of members to Congregation Ahavas Torah , the synagogue was not large enough , particularly during the High Holidays , when extra space had to be rented . In 1997 the congregation purchased the property of the University Street Christian Church for $ 500 @,@ 000 ( today $ 740 @,@ 000 ) , and began planning for a new facility . The members considered renovating the existing building on the property , but felt a new building would better suit their requirements , and razed the church . 
- In 2003 the congregation got a permit to begin construction of a new facility on the now @-@ vacant 1 @.@ 37 @-@ acre ( 0 @.@ 55 ha ) plot of land at the northwest corner of East 29th Avenue and University Street . An initial capital campaign raised more than $ 1 @.@ 8 million , which fully paid for the land , and by August 2007 an additional $ 1 @.@ 7 million had been raised towards anticipated overall project costs of $ 5 million . 
- The environmentally sensitive building was designed by Mel Solomon and Associates of Kansas City and local company TBG Architects & Planners , and built by McKenzie Commercial Construction of Eugene . The building used " energy efficient heating , ventilation and lighting " : specific design issues with the building 's energy efficiency included the fact that the largest room in the building , the sanctuary , was also the least @-@ used , and , in accord with Jewish tradition , had to face east ( towards Jerusalem ) . 
- On June 8 , 2008 the congregation dedicated its new building at 1175 East 29th Avenue . At approximately 25 @,@ 000 square feet ( 2 @,@ 300 m2 ) , the facility included a sanctuary , commercial kitchen , banquet facilities , and classrooms , and housed the synagogue , the Lane County Jewish Federation , and the local Jewish Family Service . The project ended up costing $ 6 million , of which $ 4 million had been raised . 
- Made of concrete , steel , and wood , the building achieved Leadership in Energy and Environmental Design compliance " through the integration of stormwater management strategies , high efficiency irrigation , the use of recycled and / or recyclable materials , and drought tolerant plantings . " Completely recyclable materials used in the structure included carpeting and wood beams . 
- 
- = = Recent events = = 
- 
- In 2008 , Temple Beth Israel participated in Banners Across America , an " interfaith witness against torture coordinated by the National Religious Campaign Against Torture , " as part of the Jewish Campaign Against Torture . Organized by Rabbis for Human Rights — North America in honor of Torture Awareness Month , the Jewish campaign included over 25 synagogues which hung banners protesting " the use of abusive interrogation techniques by the American military and intelligence community " . That year , congregational membership reached almost 400 families , and the Talmud Torah and pre @-@ school had about 200 and 40 students respectively . 
- The congregation sold the old synagogue building on Portland Street to Security First ( Portland Street ) Child Development Center for $ 815 @,@ 000 in 2009 , carrying the Center 's financing . The building was converted for use as an educational center , while retaining some of the original architectural elements . Difficult economic conditions forced the Child Development Center to give up the building in 2011 , and Eugene 's Network Charter School planned to move into it in autumn 2011 . 
- Harris announced he would be stepping down as rabbi in 2011 , and the synagogue hired Boris Dolin as his successor . Born and raised in Oregon , Dolin had worked at Temple Beth Israel as a teacher and youth group adviser from 1999 to 2001 . A graduate of the University of Oregon , with a master 's degree in Jewish Education from the Jewish Theological Seminary , he was ordained by the Reconstructionist Rabbinical College . 
- As of 2011 , Temple Beth Israel was the largest synagogue in Eugene . It was a member of the Community of Welcoming Congregations , " an Oregon and SW Washington interfaith ministry and advocacy organization working toward full inclusion and equality for transgender , lesbian , bisexual , gay and questioning persons . " The rabbis were Yitzhak Husbands @-@ Hankin and Boris Dolin . 
- 
- 
- = New York State Route 93 = 
- 
- New York State Route 93 ( NY 93 ) is a 43 @.@ 08 @-@ mile ( 69 @.@ 33 km ) state highway in western New York in the United States . The route begins at an intersection with NY 18F in the village of Youngstown and runs in a general northwest – southeast direction across Niagara and Erie counties to its east end at an intersection with NY 5 in the town of Newstead , just south of the village of Akron . NY 93 serves as a connector between several major arterials , including NY 104 in Cambria , NY 31 just west of the city of Lockport , and NY 78 south of the city . 
- The route was assigned as part of the 1930 renumbering of state highways in New York . Although it began in Youngstown and ended in Newstead as it does today , the initial routing of NY 93 deviated from the modern path in the vicinity of the city of Lockport . From Cambria to Lockport 's eastern suburbs , the highway originally used NY 425 , Lower Mountain Road , Akron Road , and a series of streets in Lockport . NY 93 was moved onto NY 104 and Junction Road in Cambria in the 1940s , and altered to bypass Lockport to the south on a new highway and Robinson and Dysinger roads in 1991 . In 2006 , NY 93 was realigned west of Lockport to continue south on Junction Road to NY 31 . The change removed NY 93 from Upper Mountain Road , a county @-@ owned highway that had been part of the route since the 1930s . 
- 
- = = Route description = = 
- 
- 
- = = = West of Lockport = = = 
- 
- NY 93 begins at an intersection with NY 18F ( Main Street ; co @-@ designated but not signed as County Route 907 or CR 907 ) in the center of the village of Youngstown . The route proceeds eastward through the village as a two @-@ lane road named Lockport Street , serving two blocks of commercial areas before bending to the northeast and passing into the residential eastern portion of Youngstown . At the eastern village limits , NY 93 briefly widens to four lanes as it enters a partial cloverleaf interchange with the Niagara Scenic Parkway . Past the junction , the highway reverts to a two @-@ lane road and changes names to Youngstown – Lockport Road as it runs across the town of Porter . The residential surroundings continue to the hamlet of Towers Corners , where NY 93 connects to NY 18 ( Creek Road ) . 
- After NY 18 , NY 93 curves to the southeast , serving another residential stretch ahead of a junction with Youngstown – Wilson Road ( CR 36 ) on the eastern edge of Towers Corners . After this intersection , the homes give way to farms as the road heads into rural areas of the town . The route continues on a southeast track through Porter , passing a mixture of rural and residential areas on its way into the hamlet of Porter Center , where NY 93 enters an intersection with Porter Center Road ( CR 57 ) . Another southeastward stretch brings the route across Twelvemile Creek and into the hamlet of Ransomville , where NY 93 becomes the community 's main street . Through Ransomville , NY 93 retains the Youngstown – Lockport Road name , intersecting with Ransomville Road ( CR 17 ) in the hamlet 's business district . 
- Just outside Ransomville , NY 93 leaves the town of Porter for the town of Wilson . It continues generally southeastward across mostly open terrain , meeting Randall Road ( CR 83 ) and Church Street ( CR 56 ) on its way to the town of Cambria . NY 93 becomes North Ridge Road at the town line , and it soon enters the hamlet of North Ridge , a community built up around the route 's intersection with NY 425 ( Cambria – Wilson Road ) . The hamlet 's residential surroundings continue to the adjacent community of Molyneaux Corners , where NY 93 becomes concurrent with NY 104 ( Ridge Road ) . NY 93 and NY 104 proceed northeast across lightly populated areas for 2 miles ( 3 @.@ 2 km ) to the hamlet of Warren Corners , at which point NY 93 splits from NY 104 and heads southward along Town Line Road . It immediately intersects with Stone Road ( CR 19 ) before leaving the hamlet . 
- 
- = = = Lockport area = = = 
- 
- Outside of Warren Corners , the route heads across rural areas along the Cambria – Lockport town line . It soon enters the small hamlet of Hickory Corners , where the road passes under Lower Mountain Road ( CR 902 ) . Access to the highway is made by way of Town Line Road Spur ( CR 114 ) , a connector leading to Lower Mountain Road . NY 93 continues southward along the town line , changing names to Junction Road at an intersection with Upper Mountain Road ( CR 5 ) west of the city of Lockport . From here , the route crosses over CSX Transportation 's Lockport Subdivision rail line at the hamlet of Lockport Junction before intersecting with NY 31 ( Saunders Settlement Road ) and NY 270 ( Campbell Boulevard ) just south of the community . NY 270 begins straight ahead to the south while NY 93 turns northeast onto Saunders Settlement Road , beginning a concurrency with NY 31 . 
- Now fully in the town of Lockport , NY 31 and NY 93 proceed northeast through an open area of the town as a four @-@ lane divided highway . The two routes continue to the western edge of the city of Lockport , where they intersect with Upper Mountain Road and the Lockport Bypass . The overlap ends here as NY 93 turns southeastward onto the two @-@ lane bypass . Along the bypass , NY 93 briefly enters the city limits as it runs past several industrial facilities and intersects with Hinman Road ( CR 903 ) just ahead of a bridge over the Erie Canal . Past the waterway , the bypass takes a more southerly course through an undeveloped part of the town of Lockport to a junction with Robinson Road ( CR 123 ) on the Lockport – Pendleton town line . The Lockport Bypass ends here , leaving NY 93 to turn eastward onto Robinson Road . 
- The route initially serves a line of homes as it heads along Robinson Road ; however , it soon enters a commercial district surrounding the road 's intersection with NY 78 ( Transit Road ) . At this point , the Lockport – Pendleton town line turns south to follow NY 78 , leaving NY 93 fully within the town of Lockport as it runs eastward past another stretch of homes . Not far from NY 78 , NY 93 changes names to Dysinger Road at an intersection with Beattie Avenue ( CR 14 ) and Raymond Road ( CR 85 ) . The junction also marks a shift in the road 's surroundings as the homes give way to open , rolling terrain . NY 93 continues eastward for several miles to the town of Royalton , where it meets Riddle Road ( CR 35 ) and Akron Road ( CR 142 ) at adjacent intersections just east of the town line . 
- 
- = = = East of Lockport = = = 
- 
- NY 93 takes over Akron Road 's name and right @-@ of @-@ way , continuing eastward past a line of scattered homes to reach the sparsely developed hamlet of Dysinger . Here , the route turns southward at a junction with Bunker Hill Road ( CR 136 ) . Outside of Dysinger , NY 93 heads southeastward across undeveloped areas of Royalton , connecting to Block Church Road ( CR 110 ) as it approaches Tonawanda Creek and the Niagara – Erie county line . The road runs along the northern edge of the creek for about 1 @.@ 5 miles ( 2 @.@ 4 km ) prior to curving southward at an intersection with Wolcottsville Road ( CR 122 ) . The turn brings NY 93 across Tonawanda Creek and into the Erie County town of Newstead , where it becomes known as Maple Road and immediately intersects with CR 260 ( Koepsel Road ) . 
- Continuing southward , NY 93 runs across open , rolling terrain , meeting CR 259 ( Tonawanda Creek Road ) on its way to the hamlet of Swifts Mills . Here , the rural surroundings briefly give way to residential areas as NY 93 intersects with CR 255 ( Swift Mills Road ) in the center of the community . South of Swifts Mills , the road serves only intermittent stretches of homes for 2 miles ( 3 @.@ 2 km ) , including a cluster of residences around its closely spaced intersections with CR 253 ( Carney Road ) and CR 42 ( Rapids Road ) . It continues on a southward track past the eastern terminus of CR 218 ( Hunts Corner – Akron Road ) to the outskirts of the village of Akron , where the highway turns east onto Lewis Road and soon enters the village limits . NY 93 runs past a line of homes before intersecting Cedar Street , a road maintained by Erie County as CR 261 north of the village . 
- The route turns south at Cedar Street , following the residential street into downtown Akron . Here , NY 93 intersects with CR 573 ( John Street ) at a junction that was once the western terminus of NY 267 . At this intersection , NY 93 heads west on John Street for one block before continuing south on Buffalo Street for another block to Main Street . NY 93 turns westward again , following Main Street through the westernmost part of Akron 's central business district prior to curving southwestward at a junction with Mechanic Street . The highway takes on the Mechanic Street name as it crosses over Murder Creek and leaves downtown Akron . Just south of the creek , NY 93 changes names to Buell Street at an intersection with Jackson Street . 
- As the route continues southward through the southern part of Akron , it serves mostly residential areas , save for an industrial complex at NY 93 's intersection with CR 163 ( Clarence Center Road ) and CR 167 ( Parkview Drive ) . NY 93 exits Akron a short distance south of the junction , at which point the route heads into another area of open fields while retaining the Buell Street name . It continues on a southward track for about 1 mile ( 1 @.@ 6 km ) to a commercialized intersection with NY 5 ( Main Road ) , where Buell Street and NY 93 both come to an end . 
- 
- = = History = = 
- 
- 
- = = = Designation and early changes = = = 
- 
- NY 93 was established as part of the 1930 renumbering of state highways in New York , connecting the cities and villages of Youngstown , Lockport , and Akron . While the termini of NY 93 have remained the same to this day , several portions of the route have been realigned since that time . When NY 93 was first assigned , it turned south at the hamlet of North Ridge and overlapped with NY 425 along Cambria – Wilson Road to Lower Mountain Road , then part of NY 3 . NY 425 went west from this junction while NY 93 headed eastward , following NY 3 along Lower Mountain , Gothic Hill , Upper Mountain , and Saunders Settlement roads to the city of Lockport . At Locust Street , NY 93 left NY 3 and exited the city along Locust , High , and Akron streets and Akron Road . It met its current alignment southeast of the city in Royalton . 
- NY 3 was realigned c . 1932 to follow Saunders Settlement Road between Shawnee Road ( NY 425 ) and Upper Mountain Road . The former routing of NY 3 along Shawnee , Lower Mountain , Gothic Hill , and Upper Mountain roads was redesignated as NY 3A even though all of NY 3 's former routing was already part of either NY 425 or NY 93 . The NY 3A designation was eliminated c . 1935 when NY 3 was truncated eastward to a new western terminus in central New York . In the early 1940s , NY 93 was altered to follow North Ridge Road , U.S. Route 104 ( now NY 104 ) , and Junction Road between North Ridge and Lower Mountain Road . 
- Around the same time that NY 93 was rerouted , NY 270 was also extended northward along Junction Road from NY 31 to US 104 . As a result , NY 93 overlapped NY 270 between Lower Mountain Road and US 104 . The overlap with NY 270 remained in place until c . 1963 when NY 270 was truncated southward to the intersection of Lower Mountain and Junction roads . NY 93 was realigned in the late 1970s to bypass Lower Mountain and Gothic Hill Roads on Junction and Upper Mountain roads , replacing NY 270 along Junction Road . The Lower Mountain Road portion of NY 93 's former routing is now maintained by Niagara County as County Route 902 ( CR 902 ) . 
- 
- = = = Lockport realignments = = = 
- 
- The Lockport Bypass , a highway bypassing downtown Lockport to the southwest , was opened to traffic on July 26 , 1991 . The highway cost $ 7 @.@ 7 million ( equivalent to $ 13 @.@ 4 million in 2016 ) to construct and extended from the junction of NY 31 and NY 93 west of the city to Robinson Road south of downtown . NY 93 was realigned to follow the new bypass south to Robinson Road , where it turned east and followed Robinson Road ( CR 123 ) and Dysinger Road ( CR 133 ) to Akron Road in Royalton . The portion of Akron Road ( NY 93 's former routing ) east of the Lockport city limits became NY 954M , an unsigned reference route . 
- Ownership and maintenance of Robinson Road from the bypass to NY 78 was transferred from Niagara County to the state of New York on September 1 , 1990 , as part of a highway maintenance swap between the two levels of government . The portion of NY 93 between NY 78 and Akron Road became state @-@ maintained on October 1 , 1998 , as part of another swap that also transferred ownership and maintenance of Akron Road to Niagara County . Akron Road is now CR 142 . 
- On November 1 , 2005 , the Niagara County Legislature voted on a measure to allow the county to ask the New York State Department of Transportation ( NYSDOT ) to remove the NY 93 designation from Upper Mountain Road , a county @-@ maintained highway , and reassign it to Junction Road ( NY 270 ) and Saunders Settlement Road ( NY 31 ) . The impetus for the change came from a resident of Upper Mountain Road , who demanded that trucks be removed from the roadway . This part of the agenda was passed . NYSDOT agreed to the request in 2006 , rerouting NY 93 as proposed and truncating NY 270 southward to NY 31 . 
- 
- = = Major intersections = = 
- 
- 
- 
- = Operation USA = 
- 
- Operation USA ( OpUSA , Operation California , or OpCal ) is a non @-@ profit humanitarian organization dedicated to helping communities alleviate the effects of disaster , disease , and endemic poverty throughout the world by providing privately funded relief , reconstruction , humanitarian aid and development aid . It is exclusively privately funded , receiving no assistance from the United States Federal Government . OPUSA had a revenue of over $ 22 million in fiscal year 2012 and has shipped over $ 425 million worth of " high @-@ priority medical , nutritional and shelter supplies " since its inception , including shipments to Haiti , Japan , Chile , Kenya and Pakistan in 2010 and 2011 . 
- 
- = = Awards and affiliations = = 
- 
- Operation USA was part of the International Campaign to Ban Landmines in 1997 when it won the Nobel Peace Prize . Operation California was also the winner of the 1983 President 's Volunteer Action Award . Operation USA has been named one of America 's Best 100 Charities by Worth Magazine and , in October 2008 , was named the top @-@ rated " exclusively privately funded charity in the U.S. " by Charity Navigator . Operation USA collaborated with NASA 's Jet Propulsion Laboratory and the US National Laboratories at Lawrence Livermore and Los Alamos to develop new approaches to land mine detection , is a member of InterAction , and is an AlertNet news partner . In 2014 Operation USA 's CEO Richard M. Walden received the Honeywell Hometown Hero Award from the Honeywell Corp. 
- 
- = = History = = 
- 
- Operation California began in 1979 as " a relief organization created to provide aid to Vietnamese Boat People and Cambodian refugees " , founded by Richard Walden ( still active as President & CEO ) and Llewellyn Werner ( who left in early 1980 ) . The organization flew " the first international relief airlift to Cambodia since 1975 " , delivering medicine to Phnom @-@ Penh . Operation California had airlifted more than $ 3 million worth of aid by October 1979 . 
- Since then , Operation USA has become a highly acclaimed aid organization that is involved in helping people in different ways around the world . In 1982 , Operation California sent " the first private airlift from the U.S. to Poland " , delivering 200 @,@ 000 lbs. of medical supplies and medicine ; that year Operation California also airlifted medical supplies to Lebanon . In 1983 , Operation California delivered aid to the children of Vietnam and Cambodia . Operation California provided aid to the earthquake victims in Mexico City in 1985 , as well as working in cooperation with the Unitarian Universalist Service Committee and Oxfam America , to deliver $ 250 @,@ 000 worth of medical aid to Nicaragua . In 1986 Operation California , in conjunction with Medical Aid to El Salvador , sent " [ t ] wo cargo planes carrying $ 500 @,@ 000 worth of relief supplies to earthquake @-@ stricken El Salvador " . 
- In 1988 , Operation California began using the name Operation USA because it better described the effort and intent of the organization to represent the entire American people . In 1989 Operation USA facilitated operations on children in Vietnam who had cleft palates by a Los Angeles @-@ based plastic surgeon , Dr Stanley Frileck . Medical aid was delivered to Mexico in 1990 by OPUSA in conjunction with USSR relief workers . In 1991 OPUSA delivered aid to Bangladesh . OpUSA delivered aid to war @-@ torn Somalis in 1993 . In 1994 OpUSA provided earthquake relief . In 1995 the organization provided aid to Hurricane Mitch survivors in Honduras and Nicaragua . In 1999 OpUSA supplied aid to storm victims in Mexico . In 2003 OpUSA delivered aid to Iraq War victims in the Persian Gulf . The tsunami victims in Sri Lanka and Indonesia were aided by OpUSA in 2004 , as well as the Mexico City Flood victims . 
- In 2008 , OpUSA delivered aid to Myanmar cyclone victims as well as Chinese earthquake victims and flood victims in the Midwest , USA . 
- In 2015 , OpUSA partnered with UniversalGiving to raise funds for its project to deliver recovery aid to Nepal earthquake victims . 
- 
- = = Celebrity affiliates = = 
- 
- Operation USA , since the early 1980s , has relied on fundraising efforts featuring singers and celebrities . These include concerts , dinners , and other events . These promotions have featured : 
- Barbra Streisand 
- Bonnie Raitt 
- Carol Burnett 
- Crosby , Stills & Nash 
- Don Henley 
- Ed Asner 
- Frank Sinatra 
- Jack Elliot 
- Jackson Browne 
- James Garner 
- John Denver 
- Julie Andrews 
- Kirk Douglas 
- Michael Jackson 
- New American Orchestra 
- Plácido Domingo 
- Ricardo Montalban 
- Ry Cooder 
- Sharon Stone 
- The Buena Vista Social Club 
- Tony Adams 
- Rosario Dawson travelled with Operation USA to Nicaragua in 2008 . George Hamilton assisted with relief to the Philippines after Typhoon Haiyan in 2013 , as did Barbra Streisand , Rosario Dawson , Jackson Browne , Bill Maher and Judd Apatow . 
- 
- = = Film and theater projects = = 
- 
- Operation USA also relies on film and theater promotions to generate funds that pay for aid , including : 
- Because We Care ( CBS Television Special ) 
- Beyond Borders ( Hollywood ) 
- Buena Vista Social Club ( film ) ( Hollywood & Havana ) 
- Fidel ( film ) ( Hollywood for Showtime ) 
- Mary Poppins ( musical ) ( London stage ) 
- Miss Saigon ( Hollywood ) 
- Roll Bounce 
- The Killing Fields ( film ) ( Hollywood & Cambodia ) 
- Victor / Victoria ( Broadway ) 
- Eloise at The Plaza ( Disney TV ) 
- 
- 
- = Typhoon Krosa ( 2013 ) = 
- 
- Typhoon Krosa , known in the Philippines as Typhoon Vinta , was a typhoon that made landfall in the northern Philippines in late October 2013 . Forming on October 27 near Guam , the storm slowly intensified while moving westward . Krosa developed an eye and became a typhoon before striking Luzon on October 31 . The storm weakened over land , but re @-@ intensified over the South China Sea , reaching peak winds of 150 km / h ( 90 mph ) on November 2 off the southeast coast of China . Typhoon Krosa stalled and encountered unfavorable conditions , resulting in quick weakening . By November 3 , it had weakened to tropical storm status , and was no longer being warned on by the next day . In northern Luzon , Krosa damaged 32 @,@ 000 houses , including 3 @,@ 000 that were destroyed , and caused four fatalities . High winds and rainfall left P277 million ( PHP , $ 6 @.@ 4 million USD ) in damage . 
- 
- = = Meteorological history = = 
- 
- On October 27 , an area of convection with a broad circulation persisted southeast of Guam , and slowly consolidated due to moderate wind shear and westerly outflow . That day , the Japan Meteorological Agency ( JMA ) classified the system as a tropical depression about 380 km ( 235 mi ) to the southeast of Hagåtña , Guam . At 2100 UTC on October 28 , the Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) began issuing advisories on the depression , giving it the local name Vinta . The next day , the JMA upgraded the depression to Tropical Storm Krosa ( 1329 ) , and the Joint Typhoon Warning Center ( JTWC ) also classified it as Tropical Depression 29W . By that time , the storm was moving steadily westward due to the subtropical ridge to the north . With the warm waters of the Philippine Sea , lessening wind shear , and improving outflow , Krosa gradually strengthened , and the JTWC also upgraded Krosa to tropical storm status on October 30 after an eye feature developed . 
- While approaching northern Luzon on October 29 , Krosa quickly intensified as the initial eye feature organized into a well @-@ defined eye . Late on October 30 , the JTWC upgraded Krosa to typhoon status , and the next day , both PAGASA and JMA followed suit . On October 31 , Krosa made landfall in northeastern Luzon near Cagayan , and developed a symmetric eyewall while initially moving over land . Land interaction weakened the eye by the time Krosa emerged into the South China Sea late on October 31 . The next day , PAGASA discontinued advisories after the typhoon exited the region . Convection rebuilt around the center , with continued favorable conditions allowing for restrengthening . 
- Late on November 1 , a large eye redeveloped , and the JTWC estimated Krosa attained peak 1 minute sustained winds of 185 km / h ( 115 mph ) . Early the next day , the JMA also estimated the typhoon reached peak 10 minute winds of 140 km / h ( 85 mph ) . Later , increasing wind shear caused the eye to deteriorate , and Krosa began slowing about 260 km ( 160 mi ) east @-@ southeast of Hong Kong after reaching the western edge of the subtropical ridge . After remaining nearly stationary , Krosa began moving steadily to the west @-@ southwest due to a new ridge . The convection continued to weaken due to continued shear and cooler waters from upwelling , and Krosa deteriorated to tropical storm status on November 3 . The next day , the JTWC issued its final advisory after the circulation became exposed from the convection . Also on November 4 , the JMA downgraded Krosa to tropical depression status off the northeast Vietnam coast . The system dissipated at 0000 UTC on November 5 . 
- 
- = = Preparations and impact = = 
- 
- Before Krosa struck the Philippines , PAGASA issued a number 3 warning signal for portions of northern Luzon , where winds were expected to reach over 100 km / h ( 60 mph ) . The agency noted the potential for flooding and landslides . High winds knocked down trees across Luzon , and left about 80 % of Cagayan province without power , as well as some areas without internet or cellphone service . Portions of the Pan @-@ Philippine Highway were blocked , and in Lal @-@ Lo , Cagayan , a car crashed into a gasoline truck due to power outages . Agriculture damage was estimated at P273 million ( PHP , $ 6 @.@ 3 million USD ) , occurring just before the start of the harvest . Across the island , the typhoon damaged 32 @,@ 745 houses , including 3 @,@ 837 that were destroyed , forcing 65 @,@ 648 people to evacuate to storm shelters . Overall , Krosa killed four people in the Philippines , and left P277 million ( PHP , $ 6 @.@ 4 million USD ) in damage . After the storm , workers quickly restored power lines , while the government provided monetary assistance to storm @-@ ravaged families , after Cagayan was declared a state of calamity . Members of the Philippine military and Department of Public Works and Highways worked to clean up following the storm . 
- The China National Meteorological Centre issued a " yellow alert " , the second @-@ lowest of the four @-@ level warning system , for Hainan due to the threat of the storm . The agency recommended that boats return to port . Agencies in Vietnam also warned of the potential for heavy rainfall due to the dissipating Tropical Depression Krosa , and released water from three dams to prevent overflow . 
- 
- 
- = Kirby 's Block Ball = 
- 
- Kirby 's Block Ball is a 1995 action video game , a spin @-@ off from the Kirby series for the Game Boy portable console . It is a Breakout clone ; the player controls paddles along the screen 's edge to knock a bouncing ball , Kirby , into destructible bricks . The game 's 55 levels include power @-@ ups , bonus rounds , and minigames . Kirby 's Block Ball was developed by HAL Laboratory and Nintendo R & D1 . The team spent half a year revising the gameplay to match Kirby 's signature characteristics . Kirby 's Block Ball was published by Nintendo first in Japan in 1995 , later in Europe , and last in North America in 1996 . 
- Reviewers considered the game an improvement on the Breakout formula and praised its gameplay craftsmanship and incorporation of the Kirby series . It was included in multiple top Game Boy game lists and was later emulated on the Nintendo 3DS Virtual Console . 
- 
- = = Gameplay = = 
- 
- The player controls paddles along the screen 's edges to knock a bouncing ball , Kirby , into destructible bricks . The player loses a life if Kirby hits the edge of the screen . Each of the game 's eleven stages includes five rounds of increasingly complex block patterns for Kirby to clear . The ten different block types vary in durability and points value . A well @-@ timed hit of the paddle gives Kirby a powerful bounce to break through harder blocks . Another block type turns the remaining blocks into a bonus round that rewards the player for clearing the screen in the least amount of time . The player can find warp stars that lead to minigames , such as air hockey , where the player can earn extra lives . The rounds also include enemies to attack and avoid . Some enemies contain bonus items . Each stage ends in a boss fight . 
- With stone , needle , flame , and spark power @-@ ups , Kirby can transform to interact with blocks differently . For instance , the spark power @-@ up lets Kirby break through otherwise indestructible blocks , and the needle lets Kirby hit spikes once without losing a life . The game has a themed frame and uses a wide palette of colors in @-@ game when played with the Super Game Boy . 
- 
- = = Development = = 
- 
- The game was developed by HAL Laboratory with Gunpei Yokoi 's Nintendo R & D1 , and published by Nintendo . At one point in development , HAL decided that the game did not feel like a Kirby game . The team spent six months completely revising the game under explicit instructions on how Kirby should move . Kirby games contain elements of unrestricted , creative movement as a general theme . Kirby 's Block Ball was released for the Game Boy first in Japan in 1995 and later in Europe ( 1995 ) and North America ( May 1996 ) . It was later emulated on the Nintendo 3DS Virtual Console , and released first in Japan ( October 2011 ) and later in Europe ( February 2012 ) and North America ( May 2012 ) . 
- 
- = = Reception and legacy = = 
- 
- On release , the four reviewers of Electronic Gaming Monthly applauded Kirby 's Block Ball for modifying the Breakout formula to create a new and enjoyable game . They especially praised the unique power @-@ ups , though Crispin Boyer and Sushi X also felt the game was too short and easy . Nintendo Power said they enjoyed Block Ball and its number of stages , but wondered how its eight megabits of memory were being used . The magazine found the parts where Kirby eats the unbreakable blocks to be innovative . All six of the magazine 's reviewers recommended the game . 
- IGN wrote that the game was primarily remembered as " an Arkanoid or Breakout clone skinned with the Kirby franchise " . IGN calculated an average reviewer score of 7 @.@ 4 / 10 . The Kirby series became known for its number of non @-@ platformer spin @-@ offs , of which Block Ball was one , like Kirby 's Pinball Land and Kirby 's Dream Course . Kirby 's spherical shape lent itself towards ball @-@ like roles . IGN wrote that Block Ball was the first " truly out there " Kirby spin @-@ off , but that the game was too short . 
- Planet Game Boy called it one of the original Game Boy 's ten " all @-@ time classics " and GamesRadar placed it among the top 25 Game Boy games released . They considered Kirby 's Block Ball an improvement upon Alleyway , a Game Boy launch title and Breakout clone . IGN recommended the game upon its 3DS rerelease both in general and for Breakout fans . Nintendo World Report recommended the game to players who like score attack games and called it the best version of Breakout released . Retrospective reviewers found the game enjoyable and praised the craft behind the gameplay and Kirby themes . Alternatively , Kirby 's Block Ball received the lowest rating on Tim Rogers 's 2004 " Yamanote Scoring System for Portable Games " ( a metric by which he played a game while counting stops on the circular Yamanote train line until he lost interest ) with a score of " one " stop . He called it " too damned bland " . 
- In a retrospective review , Jeuxvideo.com had high praise for the level design , graphics , and animations . They also found the music excellent in comparison to the annoying and repetitive soundtrack of most Breakout clones . The magazine also liked how the game fit the Kirby universe , apart from its increased difficulty — Jeuxvideo.com occasionally had trouble hitting the slow @-@ paced ball with precision . 
- 
- 
- = Hannah Dodd = 
- 
- Hannah Dodd ( born 27 April 1992 ) is an Australian Grade IV equestrian and 2 @.@ 0 point wheelchair basketball player who represented Australia in equestrian at the 2012 Summer Paralympics in London , coming 11th and 12th in her events . Switching to wheelchair basketball , she made her debut with the national team at the Osaka Cup in February 2015 . 
- In 2008 , Dodd was the Australian national Grade IV para @-@ equestrian champion . She was runner @-@ up in 2009 , and won the Australian national championships again in 2011 , along with the Oceania Championships and the National Titles team events . By 2012 , she was the top @-@ ranked Australian competitor in her event and class . 
- After the London Paralympics , Dodd took up wheelchair basketball . She started playing for the Sydney University Flames in the Women 's National Wheelchair Basketball League in 2013 , made her debut with the national team at the Osaka Friendship Games in Osaka in February 2015 , winning bronze , and was part of the Under 25 team at the 2015 Women 's U25 Wheelchair Basketball World Championship in Beijing in July 2015 , winning silver . 
- 
- = = Personal = = 
- 
- Hannah Dodd was born on 27 April 1992 , and is from Arcadia , New South Wales . She has sacral agenesis and spina bifida with upper limb dystonia , and is missing four vertebrae in her back . When she was about a year old , her kidneys started failing . Her entire renal system needed to be reconstructed . She has two older brothers . She can walk with the aid of a caliper , and also uses a wheelchair . As of 2012 , she is a horse riding teacher and student at the University of Western Sydney where she is majoring in sports and exercise science . 
- 
- = = Equestrian = = 
- 
- Dodd is a Grade IV equestrian competitor , coached by Peter Turner . Due to her sacral agenesis , when she rides her horse , she dislocates several bones every time , but as a result of anti @-@ doping rules , she has had to find alternative ways of coping with pain associated with riding . 
- Dodd has been around horses since she was four months old , and was able to ride on her own by the time she was two years old , before she learned to walk . The sport gave her a degree of independence . She started competing in 2005 , and first represented Australia in 2006 , winning her first test in England that year . In 2008 , she became the youngest @-@ ever winner of the Australian national championships . She finished first at the March 2009 inter @-@ schools cup at the St Ives Showground , and second at the 2009 Australian national championships , but her horse , Lucifer 's Dream , was injured in 2009 . In 2009 and 2010 , she searched for another horse to assist her in getting through Paralympic qualification . She won the Australian national championships again in 2011 , along with the Oceania Championships and the National Titles team events . By 2012 , she was the top @-@ ranked Australian competitor in her event and class . 
- Dodd was selected to represent Australia at the 2012 Summer Paralympics in London in equestrian events with her horse Waikiwi . These Games were her first , and she was the youngest Australian equestrian competitor . A fund raiser was organised by Arcadia , New South Wales , residents . While her own costs and the cost of her horse were covered by Australian Paralympic Committee and Equestrian Australia , funds were required for her coach . She was placed 12th in the Individual Championship Test – Grade IV , and 11th in the Individual Freestyle Test – Grade IV and Team Test – Grade IV . 
- 
- = = Wheelchair basketball = = 
- 
- After the London Paralympics , Dodd took up wheelchair basketball . She started for the Sydney University Flames in the Women 's National Wheelchair Basketball League in 2013 . She has to strap her fingers and wrists , and usually dislocates a shoulder during a game . " I 've had a few bangs and scrapes and been tipped out of my chair a few times , " she concedes , " but it 's really fun . The fast pace really gives you an adrenalin kick and the girls I play with are awesome . " " If I have chose between my two sports for Rio , " she said , " I will go with basketball . " She made her debut with the national team , known as the Gliders , at the Osaka Cup in Osaka in February 2015 . The Gliders won bronze . In June 2015 , Dodd was selected as part of the under 25 team ( known as the Devils ) for the 2015 Women 's U25 Wheelchair Basketball World Championship in Beijing in July . The Devils won silver . By this time her health had deteriorated . She had to use a wheelchair much of the time , and her classification had dropped to a 2 @.@ 5 point player . In 2015 , she was reclassified a 2 @.@ 0 . 
- 
- 
- = Commonwealth War Graves Commission = 
- 
- The Commonwealth War Graves Commission ( CWGC ) is an intergovernmental organisation of six independent member states whose principal function is to mark , record and maintain the graves and places of commemoration of Commonwealth of Nations military service members who died in the two World Wars . The Commission is also responsible for commemorating Commonwealth civilians who died as a result of enemy action during World War II . The Commission was founded by Fabian Ware and constituted through Royal Charter in 1917 as the Imperial War Graves Commission . The change to the present name took place in 1960 . 
- The Commission , as part of its mandate , is responsible for commemorating all Commonwealth war dead individually and equally . To this end , the war dead are commemorated by name on a headstone , at an identified site of a burial , or on a memorial . War dead are commemorated uniformly and equally , irrespective of military or civil rank , race or creed . 
- The Commission is currently responsible for the continued commemoration of 1 @.@ 7 million deceased Commonwealth military service members in 153 countries . Since its inception , the Commission has constructed approximately 2 @,@ 500 war cemeteries and numerous memorials . The Commission is currently responsible for the care of war dead at over 23 @,@ 000 separate burial sites and the maintenance of more than 200 memorials worldwide . In addition to commemorating Commonwealth military service members , the Commission maintains , under arrangement with applicable governments , over 40 @,@ 000 non @-@ Commonwealth war graves and over 25 @,@ 000 non @-@ war military and civilian graves . The Commission operates through the continued financial support of the member states : United Kingdom , Canada , Australia , New Zealand , India and South Africa . The current President of the Commonwealth War Graves Commission is Prince Edward , Duke of Kent . 
- 
- = = History = = 
- 
- 
- = = = World War I = = = 
- 
- On the outbreak of World War I in 1914 , Fabian Ware , a director of the Rio Tinto Company , found that at 45 years old he was too old to join the British Army . He used the influence of Rio Tinto chairman , Viscount Milner , to become the commander of a mobile unit of the British Red Cross . He arrived in France in September 1914 and whilst there was struck by the lack of any official mechanism for documenting or marking the location of graves of those who had been killed and felt compelled to create an organisation within the Red Cross for this purpose . In March 1915 , with the support of Nevil Macready , Adjutant @-@ General of the British Expeditionary Force , Ware 's work was given official recognition and support by the Imperial War Office and the unit was transferred to the British Army as the Graves Registration Commission . The new Graves Registration Commission had over 31 @,@ 000 graves of British and Imperial soldiers registered by October 1915 and 50 @,@ 000 registered by May 1916 . 
- When municipal graveyards began to overfill Ware began negotiations with various local authorities to acquire land for further cemeteries . Ware began with an agreement with France to build joint British and French cemeteries under the understanding that these would be maintained by the French government . Ware eventually concluded that it was not prudent to leave the maintenance responsibilities solely to the French government and subsequently arranged for France to purchase the land , grant it in perpetuity , and leave the management and maintenance responsibilities to the British . The French government agreed under the condition that cemeteries respected certain dimensions , were accessible by public road , were in the vicinity of medical aid stations and were not too close to towns or villages . Similar negotiations were started with the Belgian government . 
- As reports of the grave registration work became public , the Commission began to receive letters of enquiry and requests for photographs of graves from relatives of deceased soldiers . By 1917 , 17 @,@ 000 photographs had been dispatched to relatives . In March 1915 , the Commission , with the support of the Red Cross , began to dispatch photographic prints and cemetery location information in answer to the requests . The Graves Registration Commission became the Directorate of Graves Registration and Enquiries in the spring of 1916 in recognition of the fact that the scope of work began to extend beyond simple grave registration and began to include responding to enquiries from relatives of those killed . The directorate 's work was also extended beyond the Western Front and into other theatres of war , with units deployed in Greece , Egypt and Mesopotamia . 
- 
- = = = Formal establishment = = = 
- 
- As the war continued , Ware and others became concerned about the fate of the graves in the post @-@ war period . Following a suggestion by the British Army , the National Committee for the Care of Soldiers ' Graves was appointed by the British government in January 1916 , with Edward , Prince of Wales agreeing to serve as president . The National Committee for the Care of Soldiers ' Graves was created with the intention of taking over the work of the Directorate of Graves Registration and Enquiries after the war . The government felt that it was more appropriate to entrust the work to a specially appointed body rather than to any existing government department . By early 1917 a number of members of the committee believed a formal imperial organisation would be needed to care for the graves . With the help of Edward , Prince of Wales , Ware submitted a memorandum to the Imperial War Conference in 1917 suggesting that an imperial organisation be constituted . The suggestion was accepted and on 21 May 1917 the Imperial War Graves Commission was established by Royal Charter , with the Prince of Wales serving as president , Secretary of State for War Lord Derby as chairman and Ware as vice @-@ chairman . The Commission 's undertakings began in earnest at the end of the First World War . Once land for cemeteries and memorials had been guaranteed , the enormous task of recording the details of the dead could begin . By 1918 , some 587 @,@ 000 graves had been identified and a further 559 @,@ 000 casualties were registered as having no known grave . 
- The scale , and associated high number of casualties , of the war produced an entirely new attitude towards the commemoration of war dead . Previous to World War I , individual commemoration of war dead was often on an ad hoc basis and was almost exclusively limited to commissioned officers . However , the war required mobilisation of a significant percentage of the population , either as volunteers or through conscription . An expectation had consequently arisen that individual soldiers would be commemorated , even if they were low @-@ ranking members of the military . A committee under Frederic Kenyon , Director of the British Museum , presented a report to the Commission in November 1918 detailing how it envisioned the development of the cemeteries . Two key elements of this report were that bodies should not be repatriated and that uniform memorials should be used to avoid class distinctions . Beyond the logistical nightmare of returning home so many corpses , it was felt that repatriation would conflict with the feeling of brotherhood that had developed between serving ranks . 
- An article in The Times on 17 February 1919 by Rudyard Kipling carried the Commission 's proposal to a wider audience and described what the graves would look like . The article entitled War Graves : Work of Imperial Commission : Mr. Kipling 's Survey was quickly republished as an illustrated booklet , Graves of the Fallen . The illustrated booklet was intended to soften the impact of Kenyon 's report as it included illustrations of cemeteries with mature trees and shrubs ; contrasting the bleak landscapes depicted in published battlefield photos . There was an immediate public outcry following the publication of the reports , particularly with regards to the decision to not repatriate the bodies of the dead . The reports generated considerable discussion in the press which ultimately led to a heated debate in Parliament on 4 May 1920 . Sir James Remnant started the debate , followed by speeches by William Burdett @-@ Coutts in favour of the Commission 's principles and Robert Cecil speaking for those desiring repatriation and opposing uniformity of grave markers . Winston Churchill closed the debate and asked that the issue not proceed to a vote . Remnant withdrew his motion , allowing the Commission to carry out its work assured of support for its principles . 
- 
- = = = First cemeteries and memorials to the missing = = = 
- 
- Three of the most eminent architects of their day , Sir Herbert Baker , Sir Reginald Blomfield , and Sir Edwin Lutyens were commissioned to design the cemeteries and memorials . Rudyard Kipling was appointed literary advisor for the language used for memorial inscriptions . 
- In 1920 , the Commission built three experimental cemeteries at Le Treport , Forceville and Louvencourt , following the principles outlined in the Kenyon report . Of these , the Forceville Communal Cemetery and Extension was agreed to be the most successful . Having consulted with garden designer Gertrude Jekyll , the architects created a walled cemetery with uniform headstones in a garden setting , augmented by Blomfield 's Cross of Sacrifice and Lutyens ' Stone of Remembrance . After some adjustments , Forceville became the template for the Commission 's building programme . Adjustments were required because all three experimental cemeteries went over budget . To ensure future cemeteries remained within their budget the Commission decided to not build shelters in cemeteries that contained less than 200 graves , to not place a Stone of Remembrance in any cemetery with less than 400 graves , and to limit the height of cemetery walls to 1 metre ( 3 @.@ 3 ft ) . 
- At the end of 1919 , the Commission had spent £ 7 @,@ 500 , and this figure rose to £ 250 @,@ 000 in 1920 as construction of cemeteries and memorials increased . By 1921 , the Commission had established 1 @,@ 000 cemeteries which were ready for headstone erections , and burials . Between 1920 and 1923 , the Commission was shipping 4 @,@ 000 headstones a week to France . In many cases small cemeteries were closed and the graves concentrated in larger ones . By 1927 , when the majority of construction had been completed , over 500 cemeteries had been built , with 400 @,@ 000 headstones , a thousand Crosses of Sacrifice , and 400 Stones of Remembrance . 
- The Commission had also been mandated to individually commemorate each soldier who had no known grave , which amounted to 315 @,@ 000 in France and Belgium alone . The Commission initially decided to build 12 monuments on which to commemorate the missing ; each memorial being located at the site of an important battle along the Western Front . After resistance from the French committee responsible for the approvals of memorials on French territory , the Commission revised their plan and reduced the number of memorials , and in some cases built memorials to the missing in existing cemeteries rather than as separate structures . 
- Reginald Blomfield 's Menin Gate was the first memorial to the missing located in Europe to be completed , and was unveiled on 24 July 1927 . The Menin Gate ( Menenpoort ) was found to have insufficient space to contain all the names as originally planned and 34 @,@ 984 names of the missing were instead inscribed on Herbert Baker 's Tyne Cot Memorial to the Missing . Other memorials followed : the Helles Memorial in Gallipoli designed by John James Burnet ; the Thiepval Memorial on the Somme and the Arras Memorial designed by Edwin Lutyens ; and the Basra Memorial in Iraq designed by Edward Prioleau Warren . The Dominions and India also erected memorials on which they commemorated their missing : the Neuve @-@ Chapelle Memorial for the forces of India , the Vimy Memorial by Canada , the Villers @-@ Bretonneux Memorial by Australia , the Delville Wood Memorial by South Africa and the Beaumont @-@ Hamel Memorial by Newfoundland . The programme of commemorating the dead of the Great War was considered essentially complete with the inauguration of the Thiepval Memorial in 1932 , though the Vimy Memorial would not be finished until 1936 , the Villers @-@ Bretonneux Memorial until 1938 and stonemasons were still conducting work on the Menin Gate when Germany invaded Belgium in 1940 . 
- The only memorial created by the Commission that was not in the form of a monument or cemetery was the Ophthalmic Institute at Giza , Egypt — complete with library , and bacteriology and pathology departments — as its memorial to men of the Egyptian Labour Corps and Camel Transport Corps . Its erection was agreed under local political pressure . 
- 
- = = = World War II = = = 
- 
- From the start of the Second World War in 1939 , the Commission organised grave registration units and , planning ahead based on the experience gained from the First World War , earmarked land for use as cemeteries . When the war began turning in favour of the Allies , the Commission was able to begin restoring its First World War cemeteries and memorials . It also began the task of commemorating the 600 @,@ 000 Commonwealth casualties from the Second World War . In 1949 , the Commission completed Dieppe Canadian War Cemetery , the first of 559 new cemeteries and 36 new memorials . Eventually , over 350 @,@ 000 new headstones were erected . Many were made from Hopton Wood stone . The wider scale of World War II , coupled with manpower shortages and unrest in some countries , meant that the construction and restoration programmes took much longer . Following the war , the Commission implemented a five @-@ year horticultural renovation programme . The horticultural neglect was largely addressed by 1950 but there were necessary structural repairs to be made . These , together with the backlog of maintenance tasks from before the war , took a further 10 years to complete and the programme was not completed until the 1960s . 
- With the increased number of civilian casualties compared with World War I , Winston Churchill agreed to Ware 's proposal that the Commission also maintain a record of Commonwealth civilian war deaths . A supplemental chapter was added to the Imperial War Graves Commission 's charter on 7 February 1941 , empowering the organisation to collect and record the names of civilians who died from enemy action during the Second World War , which resulted in the creation of the Civilian War Dead Roll of Honour . The roll eventually contained the names of nearly 67 @,@ 000 civilians . The Commission and the Dean of Westminster reached an agreement that the roll would eventually be placed in Westminster Abbey but not until the roll was complete and hostilities had ended . The Commission handed over the first six volumes to the Dean of Westminster on 21 February 1956 ; the final volume was added to the showcase in 1958 . 
- 
- = = = Post – World War II = = = 
- 
- Following World War II the Commission recognised that the word ' Imperial ' within its name was no longer appropriate . In the spirit of strengthening national and regional feelings the organisation 's name was changed to Commonwealth War Graves Commission in 1960 . 
- More recent conflicts have sometimes made it impossible for the Commission to care for cemeteries in a given region or resulted in the destruction of sites altogether . Zehrensdorf Indian Cemetery in Germany was unkempt after the end of World War II and until the German reunification because it was located in an area occupied by Russian forces and was not entirely rebuilt until 2005 . The Six @-@ Day War and War of Attrition resulted in the destruction of Port Tewfik Memorial and Aden Memorial , and the death of a Commission gardener at Suez War Memorial Cemetery . During the Lebanese Civil War two cemeteries in Beirut were destroyed and had to be rebuilt . The maintenance of war graves and memorials in Iraq has remained difficult since the Iran – Iraq War in the 1980s , with regular maintenance being impractical since the Gulf War . 
- The Commission has , and continues to , also provide support for war graves outside its traditional mandate . In 1982 , the British Ministry of Defence requested the Commission 's assistance to design and construct cemeteries in the Falkland Islands for those killed during the Falklands War . Although these cemeteries are not Commonwealth War Graves Commission cemeteries , the Commission manages the administrative responsibilities of these cemeteries . Since 2005 , the Commission has carried out similar management duties on behalf of the British Ministry of Defence for cemeteries and graves of British and Imperial soldiers who died during the Second Boer War . In 2003 , Veterans Affairs Canada employed the Commission to develop an approach to locate grave markers for which the Canadian Minister of Veterans Affairs has responsibility . As of 2011 , the Commission conducts a twelve @-@ year cyclical inspection programme of Canadian veterans ' markers installed at the expense of the Government of Canada . 
- In 2008 , an exploratory excavation discovered mass graves on the edge of Pheasant Wood outside of Fromelles . Two @-@ hundred and fifty British and Australian bodies were excavated from five mass graves and interred in the newly constructed Fromelles ( Pheasant Wood ) Military Cemetery . This was the first new Commonwealth War Graves Commission cemetery in more than 50 years , the last such cemeteries having been built after the Second World War . 
- 
- = = Burial sites and memorials = = 
- 
- The Commission is currently responsible for the continued commemoration of 1 @.@ 7 million deceased Commonwealth military service members in 153 countries and approximately 67 @,@ 000 civilians who died as a result of enemy action during World War II . Commonwealth military service members are commemorated by name on either a headstone , at an identified site of a burial , or on a memorial . As a result , the Commission is currently responsible for the care of war dead at over 23 @,@ 000 separate burial sites and maintenance of more than 200 memorials worldwide . The vast majority of burial sites are pre @-@ existing communal or municipal cemeteries and parish churchyards located in the United Kingdom , however the Commission has itself constructed approximately 2 @,@ 500 war cemeteries worldwide . The Commission has also constructed or commissioned memorials to commemorate the dead who have no known grave ; the largest of these is the Thiepval Memorial . 
- 
- = = = Qualifications for inclusion = = = 
- 
- The Commission only commemorates those who have died during the designated war years , while in Commonwealth military service or of causes attributable to service . The applicable periods of consideration are 4 August 1914 to 31 August 1921 for the First World War and 3 September 1939 to 31 December 1947 for the Second World War . The end date for the First World War period is the official end of the war , while for the Second World War the Commission selected a date approximately the same period after VE Day as the official end of the First World War was after the 1918 Armistice . 
- Civilians who died as a result of enemy action during the Second World War are commemorated differently from those that died as a result of military service . They are commemorated by name through the Civilian War Dead Roll of Honour located in St George 's Chapel in Westminster Abbey . In addition to its mandated duties , the Commission maintains , under arrangement with applicable governments , over 40 @,@ 000 non @-@ Commonwealth war graves and over 25 @,@ 000 non @-@ war military and civilian graves . 
- 
- = = = Architects and sculptors = = = 
- 
- As well as the main Principal Architects for France and Belgium ( Baker , Blomfield and Lutyens ) , there were Principal Architects appointed for other regions as well . Sir Robert Lorimer was Principal Architect for Italy , Macedonia and Egypt , while Sir John James Burnet was Principal Architect for Palestine and Gallipoli , assisted by Thomas Smith Tait . The Principal Architect for Mesopotamia was Edward Prioleau Warren . 
- As well as these senior architects , there was a team of Assistant Architects who were actually responsible for many of the cemetery and memorial designs . These architects were younger , and many of them had served in the war . The Assistant Architects were : George Esselmont Gordon Leith , Wilfred Clement von Berg , Charles Henry Holden ( who in 1920 became a Principal Architect ) , William Harrison Cowlishaw , William Bryce Binnie , George Hartley Goldsmith , Frank Higginson , Arthur James Scott Hutton , Noel Ackroyd Rew , and John Reginald Truelove . Other architects that worked for the Commission , or won competitions for the Commission memorials , included George Salway Nicol , Harold Chalton Bradshaw , Verner Owen Rees , Gordon H. Holt , and Henry Philip Cart de Lafontaine . 
- In January 1944 , Edward Maufe was appointed Principal Architect for the UK . Maufe worked extensively for the Commission for 25 years until 1969 , becoming Chief Architect and also succeeding Kenyon as Artistic Advisor . Together with Maufe , the other Principal Architects appointed during and after the Second World War were Hubert Worthington , Louis de Soissons , Philip Hepworth and Colin St Clair Oakes . 
- Leading sculptors that worked on the memorials and cemeteries after the First World War included Eric Henri Kennington , Charles Thomas Wheeler , Gilbert Ledward , and Charles Sargeant Jagger . Other sculptors , both in the inter @-@ war period and after the Second World War , included William Reid Dick , Ernest Gillick , Basil Gotto , Alfred Turner , Laurence A. Turner , Walter Gilbert , Henry Poole , Vernon Hill , Robert Anning Bell , Ferdinand Victor Blundstone , Joseph Armitage , and Gilbert Bayes . 
- 
- = = = Cemetery design = = = 
- 
- 
- = = = = Common architectural design features = = = = 
- 
- Structural design has always played an important part in the Commission 's cemeteries . Apart from a few exceptions , due to local geological conditions , the cemeteries follow the same design and uniform aesthetic all over the world . This makes the cemeteries easily recognisable and distinguishes them from war graves administered by other groups or countries . 
- A typical cemetery is surrounded by a low wall or hedge and with a wrought @-@ iron gate entrance . For cemeteries in France and Belgium , a land tablet near the entrance or along a wall identifies the cemetery grounds as having been provided by the French or Belgian governments . All but the smallest cemeteries contain a register with an inventory of the burials , a plan of the plots and rows , and a basic history of the cemetery . The register is located within a metal cupboard that is marked with a cross located in either the wall near the cemetery entrance or in a shelter within the cemetery . More recently , in larger sites , a stainless steel notice gives details of the respective military campaign . The headstones within the cemetery are of a uniform size and design and mark plots of equal size . 
- The cemetery grounds are , except in drier climates , grass covered with a floral border around the headstones . There is also an absence of any paving between the headstone rows which is intended to make the cemetery feel like a traditional walled garden where visitors could experience a sense of peace . However , Carter and Jackson argue that the uniform aesthetics are designed to evoke a positive experience which deliberately masks and sanitises the nature of the war deaths . 
- 
- = = = = Cross of Sacrifice and Stone of Remembrance = = = = 
- 
- Typically , cemeteries of more than 40 graves contain a Cross of Sacrifice designed by architect Reginald Blomfield . This cross was designed to imitate medieval crosses found in churchyards in England with proportions more commonly seen in the Celtic cross . The cross is normally a freestanding four @-@ point limestone Latin cross , mounted on an octagonal base , and ranging in height from 14 to 32 feet . A bronze longsword , blade down , is embedded on the face of the cross . This cross represents the faith of the majority of the dead and the sword represents the military character of the cemetery , intended to link British soldiers and the Christian concept of self @-@ sacrifice . 
- Cemeteries with more than 1000 burials typically have a Stone of Remembrance , designed by Edwin Lutyens with the inscription " Their Name Liveth for Evermore " . The concept of the Stone of Remembrance was developed by Rudyard Kipling to commemorate those of all faiths and none . In contrast to the Cross of Sacrifice , the design for the stone deliberately avoided " shapes associated with particular religions " . The geometry of the structure was based on studies of the Parthenon . Each stone is 3 @.@ 5 metres ( 11 ft ) long and 1 @.@ 5 metres ( 4 @.@ 9 ft ) high . The shape of the stone has been compared both to that of a sarcophagus and an altar . The feature was designed using the principle of entasis . The subtle curves in the design , if extended , would form a sphere 1 @,@ 801 feet 8 inches ( 549 @.@ 15 m ) in diameter . 
- 
- = = = = Headstones = = = = 
- 
- Every grave is marked with a headstone . Each headstone contains the national emblem or regimental badge , rank , name , unit , date of death and age of each casualty inscribed above an appropriate religious symbol and a more personal dedication chosen by relatives . The headstones use a standard upper case lettering designed by MacDonald Gill . Individual graves are arranged , where possible , in straight rows and marked by uniform headstones , the vast majority of which are made of Portland stone . The original headstone dimensions were 76 centimetres ( 30 in ) tall , 38 cm ( 15 in ) wide , and 7 @.@ 6 cm ( 3 @.@ 0 in ) thick . 
- Most headstones are inscribed with a cross , except for those deceased known to be atheist or non @-@ Christian . In the case of burials of Victoria Cross or George Cross recipients , the regimental badge is supplemented by the Victoria Cross or George Cross emblem . Sometimes a soldier employed a pseudonym because they were too young to serve or were sought by law enforcement ; in such cases their primary name is shown along with the notation " served as " . Many headstones are for unidentified casualties ; they consequently bear only what could be discovered from the body . The epitaph , developed by Rudyard Kipling , that appears on the graves of unidentified soldiers for which no details are known is " A Soldier of the Great War known unto God " . Some headstones bear the text " believed to be buried in this cemetery " when the casualty is believed to be buried in the cemetery but the exact location of the grave is not known . In some cases soldiers were buried in collective graves and distinguishing one body from another was not possible and thus one headstone covers more than one grave . The headstones do not denote any specific details of the death except for its date , and even then only if it is known , and are deliberately ambiguous about the cause of death . 
- Due to local conditions it was sometimes necessary for the Commission to deviate from its standard design . In places prone to extreme weather or earthquakes , such as Thailand and Turkey , stone @-@ faced pedestal markers are used instead of the normal headstones . These measures are intended to prevent masonry being damaged during earthquakes or sinking into sodden ground . In Italy headstones were carved from Chiampo Perla limestone because it was in more plentiful supply . In Struma Military Cemetery , in Greece , to avoid risk of earthquake damage , small headstones are laid flat on the ground . The smaller size of the markers means that they often lack unit insignia . 
- 
- = = = = Horticulture = = = = 
- 
- Commission cemeteries are distinctive in treating floriculture as an integral part of the cemetery design . Originally , the horticultural concept was to create an environment where visitors could experience a sense of peace in a setting , in contrast to traditionally bleak graveyards . Recommendations given by Arthur William Hill , the Assistant Director of the Royal Botanical Gardens at Kew enabled the Commission to develop cemetery layouts and architectural structures that took into account the placement of suitable plant life . Combining structural and horticultural elements was not unfamiliar to the Commission 's architects . Sir Edwin Lutyens furthered his long @-@ standing working relationship with horticulturist Gertrude Jekyll , whose devotion to traditional cottage garden plants and roses greatly influenced the appearance of the cemeteries . Where possible , indigenous plants were utilised to enhance sentimental associations with the gardens of home . 
- Variety in texture , height and timing of floral display were equally important horticultural considerations . The beds around each headstone are planted with a mixture of floribunda roses and herbaceous perennials . Low @-@ growing plants are chosen for areas immediately in front of headstones , ensuring that inscriptions are not obscured and preventing soil from splashing back during rain . In cemeteries where there are pedestal grave markers , dwarf varieties of plants are used instead . 
- The absence of any form of paving between the headstone rows contributes to the simplicity of the cemetery designs . Lawn paths add to the garden ambiance , and are irrigated during the dry season in countries where there is insufficient rain . Where irrigation is inappropriate or impractical , dry landscaping is an ecological alternative favoured by the Commission 's horticulturists , as is the case in Iraq . Drier areas require a different approach not only for lawns , but also to plants and styles of planting . Similarly , there are separate horticultural considerations in tropical climates . When many cemeteries are concentrated within a limited area , like along the Western Front or Gallipoli peninsula , mobile teams of gardeners operate from a local base . Elsewhere , larger cemeteries have their own dedicated staff while small cemeteries are usually tended by a single gardener working part @-@ time . 
- 
- = = Organisation = = 
- 
- 
- = = = Commissioners = = = 
- 
- The affairs of the CWGC are overseen by a Board of Commissioners . The president of the board is Prince Edward , Duke of Kent , the chairman is United Kingdom Secretary of State for Defence Michael Fallon and the vice @-@ chairman Vice @-@ Admiral Tim Laurence . The members are : the High Commissioner for New Zealand to the United Kingdom Lockwood Smith , the High Commissioner for Australia to the United Kingdom Alexander Downer , the Acting High Commissioner of the Republic of South Africa to the United Kingdom Obed Mlaba , the High Commissioner for India to the United Kingdom Ranjan Mathai , the High Commissioner for Canada to the United Kingdom Gordon Campbell , Hew Strachan , Keith Simpson , Kevan Jones , Edward Chaplin , Robert Fox , Ros Kelly and Lieutenant General Bill Rollo . Victoria Wallace is the Director @-@ General of the CWGC and serves as secretary . The board also has an Honorary Artistic Adviser , Peter Inskip . 
- 
- = = = Functional structure = = = 
- 
- The CWGC is headquartered in Maidenhead , England . Offices or agencies that are each responsible for a specific geographical area manage the worldwide affairs of the organisation . They are : 
- France Area is headed by a director and is responsible for France ( including the island of Corsica ) , Monaco and Switzerland . 
- Northern Europe Area , headed by a director and responsible for Austria , Belgium , Czech Republic , Denmark , Estonia , Germany , Hungary , Latvia , Lithuania , Luxembourg , Netherlands , Norway , Poland and Sweden . 
- United Kingdom Area , headed by a director and responsible for Channel Islands , Faroe Islands , Iceland , Ireland , Isle of Man and the United Kingdom 
- Mediterranean Area headed by a director and responsible for Albania , Algeria , Azerbaijan , Azores , Bahrain , Canary Islands , Croatia , Cyprus , Egypt , Gibraltar , Greece , Israel and Palestine , Italy , Jordan , Lebanon , Libya , Macedonia , Madeira , Malta , Mauritania , Morocco , Oman , Portugal , San Marino , Saudi Arabia , Serbia , Spain , Syria , Tunisia , Turkey , United Arab Emirates and Yemen 
- Canadian Agency is headed by a secretary @-@ general and responsible for Canada , the entire Americas ( including the Caribbean ) 
- Australia , managed by the Office of Australian War Graves in the Australian Department of Veterans Affairs on behalf of the CWGC , is responsible for Australia , Norfolk Island , Papua New Guinea and the Solomon Islands 
- New Zealand , managed by the New Zealand Ministry of Culture and Heritage on behalf of the CWGC , is responsible for New Zealand , New Caledonia , Samoa , Society Islands , Tonga and Vanuatu 
- South Africa Agency is headed by a secretary and is responsible for Republic of South Africa , Namibia , Saint Helena and Ascension Island 
- Africa , Asia and Pacific Area is headed by a director and is responsible for areas not covered by any of the other bodies . 
- 
- = = = Financing = = = 
- 
- The CWGC 's work is funded predominantly by grants from the governments of the six member states . In the fiscal year 2012 / 13 , these grants amounted to £ 58 @.@ 6 million of the organisation 's £ 66 @.@ 5 million of income . This equates to an approximate cost of C $ 85 per commemorated war dead . The contribution from each country is proportionate to the number of graves the CWGC maintains on behalf of that country . The percentage of total annual contributions for which each country is responsible is United Kingdom 78 @.@ 4 % , Canada 10 @.@ 1 % , Australia 6 @.@ 1 % , New Zealand 2 @.@ 1 % , South Africa 2 @.@ 1 % and India 1 @.@ 2 % . 
- 
- = = Ongoing projects and issues = = 
- 
- 
- = = = War Graves Photographic Project = = = 
- 
- A project is underway to photograph the graves of and memorials to all service personnel from 1914 to the present day and make the images available to the public . The work is being carried out by The War Graves Photographic Project in conjunction with the CWGC . As of August 2013 , the project has recorded 1 @.@ 7 million photographs for posterity . 
- 
- = = = Reburials and identifications = = = 
- 
- Immediately following the First World War , the British Army remained responsible for the exhumation of remains . The Western Front was divided into sectors and combed for bodies by 12 @-@ man exhumation units . Between the Armistice and September 1921 , the exhumation units reburied 204 @,@ 695 bodies . After 1921 , no further widespread search for bodies was undertaken and in February 1921 responsibility for the cemeteries was transferred to the Commission . Despite the rigorous searches , bodies continued to be discovered in numbers . In the three years following the conclusion of the general search 38 @,@ 000 bodies were discovered . In the mid 1920s , 20 to 30 bodies were being discovered weekly . 
- The discovery of remains of First and Second World War casualties remains a common occurrence with approximately 30 bodies discovered annually . For example , in 2006 eight bodies of Canadian soldiers from the 78th Battalion ( Winnipeg Grenadiers ) , CEF were discovered in a backyard in Hallu , France . In April 2013 , the remains of four British soldiers discovered by a French farmer clearing land with a metal detector in 2009 were re @-@ interred at H.A.C. Cemetery near Arras , France . In March 2014 , the remains of 20 Commonwealth and 30 German soldiers were discovered in Vendin @-@ le @-@ Vieil , France with the Commonwealth soldiers being subsequently reburied at Loos British Cemetery . 
- When the remains of a Commonwealth soldier from the First or Second World War are discovered the Commission is notified and a Commission burial officer tries to collect any associated artifacts that may help in identifying the remains . The details are then registered and archived at the Commission 's headquarters . The collection of evidence can include artifacts with the remains , anthropological data and DNA . The archival records of the commission are open to the public to permit individuals to conduct their own research . Investigation of archival records by members of the public periodically results in the identification of previously buried casualties . In December 2013 , it was discovered that Second Lieutenant Philip Frederick Cormack , who was previously commemorated on the Arras Flying Services Memorial , had in fact been buried in a French military cemetery in Machelen , East @-@ Flanders in Belgium . Sergeant Leonard Maidment was identified in 2013 after a visitor to Marfaux British Cemetery discovered a headstone of an unknown sergeant with the Hampshire Regiment killed on 20 July 1918 and was subsequently able to show that only one sergeant from that regiment had been killed in France on that date . 
- 
- = = = Vandalism = = = 
- 
- Cemeteries , including those of war dead , are targets for vandalism . The gravestones , cemeteries and buildings of the Commission are no exception . The Commission believes that graffiti and damage to stonework are usually the pursuits partaken by young people , noting the number of incidents increases when schoolchildren are on school holidays . Determined thieves will also steal the bronze swords off the Cross of Sacrifice , which are now replaced with identical ones made in fibreglass . 
- The vandalism of Commission cemeteries has also been connected to the participation of Commonwealth countries in contemporary conflicts . In the 1970s , in The Troubles , Commission cemeteries in Ireland experienced vandalism . Vandals defaced the central memorial of the Étaples Military Cemetery in northern France with anti @-@ British and anti @-@ American graffiti on 20 March 2003 immediately after the beginning of the Iraq War . On 9 May 2004 , thirty @-@ three headstones were demolished in the Gaza cemetery , which contains 3 @,@ 691 graves , allegedly in retaliation for the Abu Ghraib prisoner abuse scandal . On 24 February 2012 , during the Libyan Civil War , an Islamist militia damaged over 200 headstones in the Benghazi war cemetery as well as the central memorial . 
- 
- 
- = Tatwine = 
- 
- Tatwine or Tatwin ( Tatuini or Tadwinus ; c . 670 – 734 ) was the tenth Archbishop of Canterbury from 731 to 734 . Prior to becoming archbishop , he was a monk and abbot of a Benedictine monastery . Besides his ecclesiastical career , Tatwine was a writer , and riddles he composed survive . Another work he composed was on the grammar of the Latin language , which was aimed at advanced students of that language . He was subsequently considered a saint . 
- 
- = = Biography = = 
- 
- Tatwine was a Mercian by birth . His epigraph at Canterbury stated that when he died he was in old age , so perhaps he was born around 670 . He became a monk at the monastery at Breedon @-@ on @-@ the @-@ Hill in the present @-@ day County of Leicestershire , and then abbot of that house . Through the influence of King Æthelbald he was appointed as Archbishop of Canterbury in 731 and was consecrated on 10 June 731 . He was one of a number of Mercians who were appointed to Canterbury during the 730s and 740s . Apart from his consecration of the Bishops of Lindsey and Selsey in 733 , Tatwine 's period as archbishop appears to have been uneventful . He died in office on 30 July 734 . Later considered a saint , his feast day is 30 July . 
- 
- = = Writings = = 
- 
- Bede 's commentary on Tatwine calls him a " vir religione et Prudentia insignis , sacris quoque literis nobiliter instructus " ( a man notable for his prudence , devotion and learning ) . These qualities were displayed in the two surviving manuscripts of his riddles and four of his Ars Tatuini . The Ars is one of only two surviving 8th @-@ century Latin grammars from England , and was based on the works of Priscian and Consentius . The riddles deal with such diverse topics as philosophy and charity , the five senses and the alphabet , and a book and a pen . The riddles are formed in acrostics . The grammar is a reworking of Donatus 's Ars Minor with the addition of information drawn from other grammarians . It was not designed for a newcomer to the Latin language , but is designed for more advanced students . It covers the eight parts of speech through illustrations drawn from classical scholars , although not directly but through other grammatical works . There are also some examples drawn from the Psalms . The work was completed before he became archbishop , and was used not only in England but also on the continent . A recent edition of his works is Tatuini Opera omnia , published in 1968 with some translations into English and German from the original Latin . 
- 
- 
- = German Type UB I submarine = 
- 
- The Type UB I was a class of small coastal submarines ( U @-@ boats ) built in Germany at the beginning of the First World War . 20 boats were constructed , most of which went into service with the German Imperial Navy . Boats of this design were also operated by the Austro @-@ Hungarian Navy ( Kaiserliche und Königliche Kriegsmarine or K.u.K. Kriegsmarine ) and the Bulgarian Navy . The group is sometimes known as the UB @-@ 1 class after SM UB @-@ 1 , the class leader . In the Austro @-@ Hungarian Navy , it was called the U @-@ 10 class . 
- Built to meet the need for small maneuverable submarines able to operate in the narrow , shallow seas off Flanders , the vessels were intended to be quickly constructed , then shipped by rail and assembled at their port of operation . The design effort began in mid @-@ August 1914 and by mid @-@ October the first 15 boats were ordered from two German shipyards . The German Imperial Navy subsequently ordered an additional pair of boats to replace two sold to Austria @-@ Hungary , who ordered a further three boats in April 1915 . A total of 20 UB Is were built . Construction of the first boats for Germany began in early November 1914 ; all 20 were completed by October 1915 . Several of the first boats underwent trials in German home waters , but the rest were assembled and tested at either Antwerp or Pola . The German boats operated primarily in the Flanders , Baltic , and Constantinople Flotillas . The boats were about 28 metres ( 92 ft ) long and displaced 127 tonnes ( 125 long tons ) when surfaced and 142 tonnes ( 140 long tons ) while submerged . All had two bow torpedo tubes and two torpedoes , and were equipped with a deck @-@ mounted machine gun . 
- In 1918 four of the surviving German boats were converted into coastal minelayers . Of the seventeen boats in German service , two were sold to Austria @-@ Hungary , one was sold to Bulgaria , and nine were lost during the war . One of the five Austro @-@ Hungarian boats was sunk and another mined and not repaired . The five surviving German boats , the four surviving Austro @-@ Hungarian boats , and the Bulgarian boat were all turned over to the Allies after the end of the war and were broken up . 
- 
- = = Design = = 
- 
- In the earliest stages of the First World War the German Army 's rapid advance along the North Sea coast found the German Imperial Navy without submarines suitable to operate in the narrow and shallow seas off Flanders . By 18 August 1914 , two weeks after the German invasion of Belgium , the planning of a series of small coastal submarines had already begun . 
- The German Imperial Navy stipulated that the submarines must be transportable by rail , which imposed a maximum diameter of 3 @.@ 15 metres ( 10 ft 4 in ) . The rushed planning effort — which had been assigned the name " Project 34 " — resulted in the Type UB I design , created specifically for operation from Flanders . The boats were to be about 28 metres ( 92 ft ) long and to displace about 125 tonnes ( 123 long tons ) with two bow torpedo tubes . 
- Boats of the Type UB I design were built by two manufacturers , Germaniawerft of Kiel and AG Weser of Bremen , which led to some variations in boats from the two shipyards . The eight Germaniawerft @-@ built boats were slightly longer at 28 @.@ 10 metres ( 92 ft 2 in ) length overall , while the twelve Weser @-@ built boats came in 22 centimetres ( 8 @.@ 7 in ) shorter than their counterparts . All were 3 @.@ 15 metres ( 10 ft 4 in ) abeam and had a draft of 3 @.@ 03 metres ( 9 ft 11 in ) . The boats all displaced 127 tonnes ( 125 long tons ) while surfaced , but differed slightly in displacement submerged . The slightly longer Germaniawerft boats displaced 142 tonnes ( 140 long tons ) while submerged , as they weighed 1 tonne ( 0 @.@ 98 long tons ) more than the Weser boats . 
- The drivetrain of the boats consisted of a single propeller shaft driven by a Daimler ( Germaniawerft ) or Körting ( Weser ) diesel engine on the surface , or a Siemens @-@ Schuckert electric motor for underwater travel . The Weser boats were capable of nearly 7 @.@ 5 knots ( 13 @.@ 9 km / h ; 8 @.@ 6 mph ) on the surface and a little more than 6 knots ( 11 km / h ; 6 @.@ 9 mph ) submerged . The Germaniawerft boats were about 1 knot ( 1 @.@ 9 km / h ; 1 @.@ 2 mph ) slower than their Bremen @-@ made counterparts . The boats were equipped with two 45 @-@ centimetre ( 17 @.@ 7 in ) bow torpedo tubes and carried two torpedoes . They were also armed with a single 8 @-@ millimetre ( 0 @.@ 31 in ) machine gun affixed to the deck . 
- 
- = = Construction = = 
- 
- The German Imperial Navy ordered its first fifteen Type UB I boats on 15 October 1914 . Eight boats — numbered UB @-@ 1 to UB @-@ 8 — were ordered from Germaniawerft of Kiel , and seven boats — numbered UB @-@ 9 to UB @-@ 15 — from AG Weser of Bremen . After two of the class , UB @-@ 1 and UB @-@ 15 , were sold in February 1915 to ally Austria @-@ Hungary ( becoming U @-@ 10 and U @-@ 11 in the Austro @-@ Hungarian Navy ) , the German Imperial Navy ordered UB @-@ 16 and UB @-@ 17 from Weser . A further three for Austria @-@ Hungary — U @-@ 15 , U @-@ 16 , and U @-@ 17 — had been ordered from Weser by April , bringing the total number constructed to 20 . 
- UB @-@ 1 and UB @-@ 2 were laid down on 1 November 1914 at the Germaniawerft yard at Kiel . UB @-@ 1 was launched on 22 January 1915 , just 75 working days later . UB @-@ 2 's launch followed on 13 February . Among the Weser boats , UB @-@ 9 was laid down first , on 6 November 1914 , and launched on 6 February 1915 , a week ahead of UB @-@ 2 . These first three boats launched underwent trials in home waters , but most of the other members of the class were shipped via rail and underwent trials at their assembly point . 
- The process of shipping the submarines by rail involved breaking the submarines down into what was essentially a knock down kit . Each boat was broken into approximately fifteen pieces and loaded on to eight railway flatcars . Type UB I boats destined for service with the Flanders Flotilla made a five @-@ day journey to Antwerp for the two- to three @-@ week assembly process . After assembly at Antwerp the boats were towed by barge to Bruges for trials . Boats selected for service in the Mediterranean were sent to the Austro @-@ Hungarian port of Pola for assembly . The total time from departure of the railcars from the shipyard to operational readiness for the boats was about six weeks . 
- By July 1915 all seventeen of the German Imperial Navy Type UB Is had been completed . 
- 
- = = Service = = 
- 
- During their trials the Type UB Is were found to be too small and too slow and had a reputation for being underpowered ; one commander compared his Type UB I to a " sewing machine " . According to authors R. H. Gibson and Maurice Prendergast in their 1931 book The German Submarine War , 1914 – 1918 , the UBs did not have enough power to chase down steamers while surfaced and lacked the endurance to spend any extended amount of time underwater , exhausting their batteries after little over an hour 's running . In @-@ service use revealed another problem : with a single propeller shaft / engine combination , if either component failed , the U @-@ boat was almost totally disabled . 
- Another reported problem with the Type UB Is was the tendency to break trim after the firing of torpedoes . The boats were equipped with compensating tanks designed to flood and offset the loss of the C / 06 torpedo 's 1 @,@ 700 @-@ pound ( 770 kg ) weight , but this system did not always function properly ; as a result , when firing from periscope depth the boat could broach after firing or , if too much weight was taken on , plunge to the depths . When UB @-@ 15 torpedoed and sank Italian submarine Medusa in June 1915 , the tank failed to properly compensate , forcing the entire crew to run to the stern to offset the trim imbalance . 
- Despite the problems , the " tin tadpoles " , as the Germans referred to them , were in active service from March 1915 through the end of the war , with half of the 20 boats lost during the war . Boats of the class served in three navies : the German Imperial Navy , the Austro @-@ Hungarian Navy , and the Bulgarian Navy . In German service , they served primarily in the Flanders Flotilla , the Baltic Flotilla , and the Constantinople Flotilla . 
- 
- = = = German Imperial Navy = = = 
- 
- 
- = = = = Flanders Flotilla = = = = 
- 
- The first Type UB I to enter service was UB @-@ 10 , which formed the nucleus of the Flanders Flotilla , on 27 March 1915 . By the end of April five more Type UB I boats had become operational . UB @-@ 10 was eventually joined in the Flanders Flotilla by UB @-@ 2 , UB @-@ 4 , UB @-@ 5 , UB @-@ 6 , UB @-@ 12 , UB @-@ 13 , UB @-@ 16 , and UB @-@ 17 ; of these , only UB @-@ 2 made the journey to Flanders by sea rather than rail . 
- UB @-@ 4 departed on the first patrol from Flanders on 9 April , and was responsible for sinking the first ship sent down by the flotilla . The Type UB I boats of the Flanders Flotilla originally patrolled the area between the United Kingdom and the Netherlands , but began patrolling the English Channel after UB @-@ 6 pioneered a route past British antisubmarine nets and mines in the Straits of Dover in late June . 
- Over the Type UB Is ' first year of service , UB @-@ 4 and UB @-@ 13 were both lost , and UB @-@ 2 and UB @-@ 5 were transferred to the Baltic Flotilla . In March 1917 , UB @-@ 6 ran aground in Dutch waters and was interned for the rest of the war , along with her crew . The four remaining Type UB Is in Flanders — UB @-@ 10 , UB @-@ 12 , UB @-@ 16 , UB @-@ 17 — were all converted to minelayers by 1918 , having their torpedo tubes removed and replaced with chutes to carry up to eight mines . All but UB @-@ 10 were lost in 1918 ; UB @-@ 10 , in poor repair and out of service , was scuttled in October 1918 when the Germans evacuated from Flanders . 
- 
- = = = = Baltic Flotilla = = = = 
- 
- UB @-@ 9 was initially assigned to the Baltic Flotilla , and was joined by UB @-@ 2 and UB @-@ 5 in early 1916 . All three became training boats at Kiel in 1916 , joining UB @-@ 11 in that duty . Little information is available about the Type UB I boats operating in the Baltic . 
- 
- = = = = Constantinople Flotilla = = = = 
- 
- Four of the German Imperial Navy boats — UB @-@ 3 , UB @-@ 7 , UB @-@ 8 , and UB @-@ 14 — were selected for service with the Constantinople Flotilla . All were sent to Pola for assembly and trials there as part of the Pola Flotilla before sailing on to join the Constantinople Flotilla . UB @-@ 3 disappeared en route to Constantinople in May 1915 , but the other three arrived there by mid @-@ June . 
- The three Type UB I boats of the Constantinople Flotilla seem to have patrolled primarily in the Black Sea . UB @-@ 8 was transferred to the Bulgarian Navy in May 1916 , and UB @-@ 7 disappeared in the Black Sea in October 1916 , leaving UB @-@ 14 as the sole remaining German Type UB I in the flotilla ; she was surrendered at Sevastopol in November 1918 to French armies stationed there during the Russian Civil War . 
- 
- = = = Austro @-@ Hungarian Navy = = = 
- 
- UB @-@ 1 and the still incomplete UB @-@ 15 were sold to Austria @-@ Hungary in February 1915 ; both were dismantled and shipped to Pola in May . After one cruise under the German flag , each boat was commissioned into the Austro @-@ Hungarian Navy . The pair — renamed U @-@ 10 and U @-@ 11 , respectively — were joined by U @-@ 15 , U @-@ 16 , and U @-@ 17 in October . Known as the U @-@ 10 or the Okarina ( English : Ocarina ) class as a part of the Austro @-@ Hungarian Navy , the five boats operated primarily in the Adriatic in patrols off Italy and Albania . U @-@ 10 ( ex UB @-@ 1 ) hit a mine in July 1918 and was beached , but had not been repaired by the end of the war . U @-@ 16 was sunk after she torpedoed an Italian destroyer in October 1916 , and the remaining three ( and the unrepaired U @-@ 10 ) were ceded to Italy at the end of the war . 
- 
- = = = Bulgarian Navy = = = 
- 
- After UB @-@ 8 was transferred to the Bulgarian Navy in May 1916 , she was renamed Podvodnik No. 18 ( in Cyrillic : Пoдвoдник No. 18 ) . She was Bulgaria 's first submarine , and was engaged primarily in coastal defense duties off Bulgaria 's main Black Sea port of Varna . Podvodnik No. 18 survived the war and was ceded to France after the Treaty of Neuilly @-@ sur @-@ Seine . 
- 
- = = List of Type UB I submarines = = 
- 
- 20 Type UB I submarines were built , 17 for the German Imperial Navy and three for the Austro @-@ Hungarian Navy . Two of the German submarines — UB @-@ 1 and UB @-@ 15 — were sold to Austria @-@ Hungary and commissioned into the Austro @-@ Hungarian Navy as U @-@ 10 and U @-@ 11 , respectively . Those two and a further three built by AG Weser comprised the virtually identical U @-@ 10 class for the Austro @-@ Hungarian Navy . Another of the German submarines , UB @-@ 8 , was sold to Bulgaria in May 1916 , becoming Podvodnik No. 18 . 
- 
- = = = German Imperial Navy = = = 
- 
- SM UB @-@ 1 ( became the Austro @-@ Hungarian U @-@ 10 , July 1915 ) 
- SM UB @-@ 2 
- SM UB @-@ 3 
- SM UB @-@ 4 
- SM UB @-@ 5 
- SM UB @-@ 6 
- SM UB @-@ 7 
- SM UB @-@ 8 ( became the Bulgarian Podvodnik No. 18 , May 1916 ) 
- SM UB @-@ 9 
- SM UB @-@ 10 
- SM UB @-@ 11 
- SM UB @-@ 12 
- SM UB @-@ 13 
- SM UB @-@ 14 
- SM UB @-@ 15 ( became the Austro @-@ Hungarian U @-@ 11 , June 1915 ) 
- SM UB @-@ 16 
- SM UB @-@ 17 
- 
- = = = Austro @-@ Hungarian Navy = = = 
- 
- In the Austro @-@ Hungarian Navy the Type UB I boats were known as the U @-@ 10 class , which consisted of two former German Type UB I boats and three built specifically for Austria @-@ Hungary . 
- SM U @-@ 10 ( the former German UB @-@ 1 ) 
- SM U @-@ 11 ( the former German UB @-@ 15 ) 
- SM U @-@ 15 ( Austria @-@ Hungary ) 
- SM U @-@ 16 ( Austria @-@ Hungary ) 
- SM U @-@ 17 ( Austria @-@ Hungary ) 
- In addition , four of the German Type UB Is assigned to the Pola Flotilla based at the Austro @-@ Hungarian Navy 's main naval base at Pola were assigned Austro @-@ Hungarian designations . 
- SM UB @-@ 3 ( as U @-@ 9 ) 
- SM UB @-@ 7 ( as U @-@ 7 ) 
- SM UB @-@ 8 ( as U @-@ 8 ) 
- SM UB @-@ 14 ( as U @-@ 26 ) 
- These four boats remained under commission in the German Imperial Navy , retained German crews and commanders , and received orders from the German flotilla commander at Pola . 
- 
- = = = Bulgarian Navy = = = 
- 
- Germany and Bulgaria negotiated the purchase of two UB I boats for the Bulgarian Navy , UB @-@ 7 and UB @-@ 8 , in 1916 . Two crews of Bulgarian sailors were sent to Kiel for training . Before the purchase could be completed , UB @-@ 7 was sunk , leaving only one boat for Bulgaria . On 25 May 1916 , UB @-@ 8 was officially transferred to Bulgaria for the remainder of the war . 
- Podvodnik No. 18 ( the former German UB @-@ 8 ) 
- Key 
- 
- 
- = Military history of Gibraltar during World War II = 
- 
- The military history of Gibraltar during World War II exemplifies Gibraltar 's position as a British fortress since the early 18th century and as a vital factor in British military strategy , both as a foothold on the continent of Europe , and as a bastion of British sea power . During World War II , Gibraltar served a vital role in both the Atlantic Theatre and the Mediterranean Theatre , controlling virtually all naval traffic into and out of the Mediterranean Sea from the Atlantic Ocean . 
- In addition to its commanding position , Gibraltar provided a strongly defended harbour from which ships could operate in both the Atlantic and the Mediterranean . Force H , under the command of Vice @-@ Admiral James Somerville was based in Gibraltar and had the task of maintaining naval superiority and providing a strong escort for convoys to and from the besieged island of Malta . During the course of the war , Gibraltar came under aerial bombardment from Vichy French aircraft and from aircraft of the Italian Royal Air Force ( Regia Aeronautica ) based on Sardinia . Additionally , the fortress was the focus of underwater attacks by the Italian Royal Navy ( Regia Marina ) commando frogman unit ( Decima Flottiglia MAS ) and their human torpedoes . This Italian unit was based on the interned Italian ship SS Olterra in the nearby Spanish harbour of Algeciras . A number of attacks were also carried out by Spanish and Gibraltarian agents acting on behalf of the German Abwehr . 
- Inside the Rock of Gibraltar itself , miles of tunnels were excavated from the limestone . Masses of rock were blasted out to build an " underground city " . In huge man @-@ made caverns , barracks , offices , and a fully equipped hospital were constructed , complete with an operating theatre and X @-@ ray equipment . 
- Operation Torch , the Allied invasion of French North Africa in November 1942 , was coordinated from the " Rock " . General Dwight D. Eisenhower , who was given command of the operation , set up his headquarters in Gibraltar during the planning phases of the operation . Following the successful completion of the North African campaign and the surrender of Italy in 1943 , Gibraltar 's role shifted from a forward operating base to a rear @-@ area supply position . The harbour continued to operate dry docks and supply depots for the convoy routes through the Mediterranean until V @-@ E Day in 1945 . 
- 
- = = Prelude and evacuation = = 
- 
- World War II dramatically changed the lives of Gibraltarians . The decision to enforce mass evacuation in order to increase the strength of the Rock with more military and naval personnel meant that most Gibraltarians ( some for up to ten years ) had nowhere to call ' home ' . Only those civilians with essential jobs were allowed to stay but it gave the entire community a sense of being ' British ' by sharing in the war effort . 
- In early June 1940 , about 13 @,@ 500 evacuees were shipped to Casablanca in French Morocco . However , following the capitulation of the French to the German armies later in June 1940 , the new Pro @-@ German French Vichy Government found the presence of Gibraltarian evacuees in Casablanca an embarrassment and sought opportunities for their removal . The opportunity soon arose when 15 British cargo vessels arrived under Commodore Crichton , repatriating 15 @,@ 000 French servicemen who had been rescued from Dunkirk . Once their own rescued servicemen had disembarked , the ships were interned until they agreed to take away all the evacuees . Although Crichton was unable to obtain permission to clean and restock his ships ( and contrary to British Admiralty orders which forbade the taking on of evacuees ) , when he saw the mass of civilians pouring through the dockyards , he opened up his gangways for boarding . Just beforehand , the British fleet had destroyed a number of French warships at Mers el @-@ Kebir in order to prevent them ending up in German hands . The attack , during which 1 @,@ 297 French sailors died , led to high tensions , which were evident when families were forced at bayonet point by French troops to board taking only what they could carry , leaving many possessions behind . However , when they arrived at Gibraltar , the Governor would not allow them to land , fearing that once the evacuees were back on the Rock , it would be virtually impossible to evacuate them a second time . Crowds gathered in John Mackintosh Square in the centre of Gibraltar as the news broke , speeches were made and two City Councillors accompanied by the Acting President of the Exchange and Commercial Library went to see the Governor ( Sir Clive Liddell ) to ask that the evacuees be allowed to land . 
After receiving instructions from London , a landing was allowed as long as the evacuees returned when other ships arrived to take them away from the Rock , and by 13 July the re @-@ evacuation back to Gibraltar had been completed . 
- British conservative politician Oliver Stanley agreed to accept the evacuees in the United Kingdom , but he argued with Gibraltar over the number of people involved . The Governor , he declared , had given the number of evacuees first as 13 @,@ 000 , then as 14 @,@ 000 and finally as 16 @,@ 000 . He asked for the situation to be clarified , stressing the shortage of accommodation in Britain and insisting that only 13 @,@ 000 could be accepted , 2 @,@ 000 of whom were to be sent to the Portuguese Atlantic island of Madeira . The situation , replied General Liddell on 19 July , " is that this is a fortress liable to heavy and immediate attack and there should be no civilians here whereas there are 22 @,@ 000 . The 13 @,@ 000 was the number sent to Morocco , and more would have been sent had the situation there not altered . " In London the evacuees were placed in the hands of the Ministry of Health , and many were housed in the Kensington area . Concern for them in Gibraltar mounted as the air raids against London intensified , coupled with the arrival of harrowing letters , describing the circumstances in which the evacuees were living . 
- In September rumours were already circulating among the evacuees , and in Gibraltar , that the possibility of re @-@ evacuating the Gibraltarians once more was being mooted , this time the destination being Jamaica , in the West Indies . After much contention , it was decided to send a party directly from Gibraltar to the island , and 1 @,@ 093 evacuees left for Jamaica direct , on 9 October , with more following later on . However petitions followed and the demands were met , partly for strategic reasons and the lack of available shipping . The situation at the end of 1940 , therefore , was that approximately 2 @,@ 000 evacuees were in Jamaica and a lesser number in Madeira , with the bulk of around 10 @,@ 000 housed in the London area . 
- 
- = = Royal Air Force involvement : 1939 – 1941 = = 
- 
- Construction of a solid surface runway began in late 1939 and in 1940 it was proposed to extend the existing runway to a length of 1 @,@ 550 yards ( 1 @,@ 417 m ) . The land reclamation commenced towards the end of 1941 along with the construction of an RAF camp at the " North Front " , now RAF Gibraltar . The RAF dispatched their next squadron to Gibraltar at this time . War with Germany was declared in September 1939 , and the strong possibility of German submarines concentrating in the Strait of Gibraltar and using Spanish port facilities loomed large in Admiralty thinking . So at 09 : 00 ( UTC ) on 9 September 1939 , No. 202 Squadron RAF was ordered to Gibraltar , loaded to the gunwales with equipment . 
- On 25 September 1939 , No 200 ( Coastal ) Group was formed as a subordinate formation to HQ RAF Mediterranean in control of No 202 Sqn . The Group 's function was the control of Royal Air Force units operating from Gibraltar . In late 1940 the Group was transferred to Coastal Command . Later a combined HQ was formed which commenced operations in early 1942 . 
- 
- = = Vichy French attacks : 1940 = = 
- 
- On 18 July 1940 , after the attack on the French Fleet at Mers @-@ el @-@ Kébir by the British , the Vichy government authorized a bombing raid of Gibraltar as a response . Little damage was reported to have been done . 
- On Tuesday , 24 September , the Italian Stefani news agency reported : " As a reprisal for the bombardment of Dakar yesterday morning , one @-@ hundred @-@ and @-@ twenty French aircraft based in Morocco attacked Gibraltar . " On the same day , the United Press Agency reported : " The French government has issued an official denial of reports , according to which French aircraft were said to have attacked Gibraltar . Up until now , no reprisals have been undertaken . " But the United Press report ended on an ominous note with : " French reprisals are imminent . " 
- Again , on the same day , the Vichy French government issued orders for the naval base and city of Gibraltar to be bombarded . As a result , six bomber squadrons of the Vichy French Air Force ( Armée de l 'Air de Vichy ) and four squadrons of the Vichy French Navy ( Marine nationale de Vichy ) were employed in the operation . The 64 bombers flew from bases in Oran , Tafaroui ( in Algeria ) , Meknes , Mediouna , and Port Lyautey ( in Morocco ) . The French action was approved by both the German Armistice Commission and the Italian Armistice Commission . 
- No British aircraft were encountered and much damage was done in the area south of the fortress . The South Mole and a large ship in the harbour were heavily damaged . In the northern part of Gibraltar , fires broke out . 
- On 25 September , the French returned with a larger force of eighty @-@ three bombers to cause additional damage to the naval base and harbour installations . Again , aircraft of the British Royal Air Force made no appearance . However , the French crews did report encountering heavy anti @-@ aircraft fire . One LeO 451 bomber was lost and 13 other aircraft were lightly damaged during the two days of bombing attacks . The British armed trawler HMT Stella Sirius was sunk by bombs . 
- The air attack on 25 September was the last by Vichy forces on Gibraltar . 
- 
- = = Operation Felix : 1940 – 1941 = = 
- 
- The Rock came through the war relatively unscathed but , given its strategic importance , Germany made plans to capture Gibraltar . Codenamed " Felix " , the plan which was signed by Adolf Hitler himself was formulated at the highest level of command . With or without permission , Germany would take entry through Spain and attack Gibraltar driving the British out of the Western Mediterranean . The Strait would be effectively closed to the Allies once Gibraltar was in German hands , forcing Asia @-@ bound Allied shipping to steam all the way around Africa rather than to proceed to the east via the shorter route through the Mediterranean and the Suez Canal . The Rock was to be heavily dive bombed by planes leaving France but landing afterward at Spanish air bases . To deny a possible Spanish capture of the base , the German planners decided that the final attack to seize Gibraltar was to be made by German troops alone . 
- Diplomatic failure at the highest levels of government prevented the operation , which had been drawn up in detail by the Wehrmacht in the summer and autumn of 1940 , from occurring at the beginning of 1941 . 
- General Ludwig Kübler 's XLIX Corps would conduct the actual attack on the Rock . The assault forces would comprise the Infantry Regiment Großdeutschland , the 98th Regiment of the 1st Mountain Division , 26 medium and heavy artillery battalions , three observation battalions , three engineer battalions , two smoke battalions , a detachment of 150 Brandenburgers , and up to 150 miniature remote controlled demolition vehicles ( Goliaths ) , packed with high explosives . 
- As part of a combined @-@ force operation , the German Air Force ( Luftwaffe ) would contribute Ju 88As , Stukas , Messerschmitts , three light AA battalions , and three heavy AA battalions . Nazi Germany 's Kriegsmarine would cooperate by using U @-@ boats to interfere with British naval movement and emplacing coastal batteries to further discourage the Royal Navy . 
- On 10 March 1941 , with Operation Barbarossa looming , Felix was amended to Operation Felix @-@ Heinrich , whereby German troops would be withdrawn from the USSR to capture Gibraltar . As a result of Spanish dictator Francisco Franco 's intransigence , the operation was postponed , modified , and ultimately abandoned . 
- 
- = = Italian bombing of Gibraltar = = 
- 
- From Sardinia , Italian Piaggio P.108 bombers attacked Gibraltar several times , mainly in 1942 . The last raids on Gibraltar were carried out during the 1943 Allied landing in Algeria , when those bombers successfully hit even the port of Oran . 
- The only unit of the Regia Aeronautica ( Royal Air Force ) ever to fly the Piaggio P.108 was the " 274th Long @-@ Range Bombardment Squadron " . This unit was formed in May 1941 around the first machines that came off the assembly lines . The training of the crews lasted far longer than anticipated and only in June 1942 did the 274th become operational . The most spectacular raids with the P.108 bombers were flown in October 1942 when several night attacks against Gibraltar were undertaken from Sardinia . 
- After the armistice of Cassibile ( 8 September ) , the German @-@ allied Italian Social Republic launched at least two raids on Gibraltar : one on the night of 4 – 5 June 1944 with ten SM.79bis aircraft and another on 6 June with nine aircraft . Both sorties were undertaken by the Gruppo Aerosiluranti " Buscaglia – Faggioni " . 
- 
- = = Italian frogmen raids 1940 – 1943 = = 
- 
- Known as the " Floating Trojan Horse of Gibraltar " , Decima Flottiglia MAS , an Italian commando frogman unit created during the Fascist government , engaged in numerous attacks against the harbour at Gibraltar . 
- Gibraltar was a very tempting target for the Italians , who saw it as a refuge for British warships and allied merchant shipping . The Italian frogmen originally used a Spanish villa ( Villa Carmela ) located two miles ( 3 km ) from Gibraltar owned by an Italian officer who had married a Spanish woman named Conchita Ramognino . Their base was shifted later to the Italian tanker SS Olterra , interned in Algeciras . 
- 
- = = Abwehr saboteurs from Spain = = 
- 
- Lesser known than the Italian actions were the sabotage operations and limpet @-@ mine attacks carried out by Spanish and Gibraltarian agents recruited in the Campo de Gibraltar by the Germans . The Abwehr contacted a Spanish staff officer from Campo de Gibraltar , Lieutenant Colonel Eleuterio Sánchez Rubio , a member of the Falange and coordinator of the intelligence operations in the Campo , to establish a network of saboteurs with access to Gibraltar . Sánchez Rubio designated Emilio Plazas Tejera , also a member of Falange , as operations chief of the organisation . Most of the recruits for the sabotage operations were Spaniards from the Campo . A combination of financial reward , ideological commitment and some threats and intimidation was used to gather a significant number of agents . According to the British intelligence , there were at least 183 Spaniards and Gibraltarians involved in the espionage and sabotage operations against Gibraltar . 
- Sabotage operations were ordered from Berlin in the late autumn of 1940 , but actual work did not start until early 1941 . The first operations were unsuccessful . A first attempt to smuggle a bomb into Gibraltar was aborted , as the timing device was faulty . In February there was a large explosion in the North Tunnel , and in April a bomb blew up near the airfield . In June 1941 , however , the British intelligence foiled a new attempt , by a German agent , to attach a mine alongside an Allied cargo ship . Another attempt failed when Plazas placed a bomb inside an ammunition store but was not able to bring the explosive . It was not until 1942 that the operations began to succeed . In January 1942 , two Spanish agents managed to destroy two aircraft at the North Front landing strip . 
- Financed , trained and equipped by the Germans , the saboteurs sank the armed trawler HMT Erin , and destroyed the auxiliary minesweeper HMT Honju , which resulted in the deaths of six British seamen on 18 January 1942 . Plazas was assisted by the Spanish naval commander of Puente Mayorga , Manuel Romero Hume , who allowed him to beach a rowboat there . The British intelligence was able , however , to counteract the sabotage operations . In March 1942 , a Gibraltarian , José Key , one of the most prominent agents working for the Germans , responsible for the collection of information on military movements for the Abwehr , was arrested and executed in Wandsworth Prison in late 1942 . By September 1942 , Plazas , whose activities were closely monitored by the British at that time , resigned and left Carlos Calvo , his second in command , in charge of the operations . In late 1942 , the German headquarters in Berlin ordered the sabotage operations to be expanded . In early 1943 , the arrival of an experienced head of Abwehr operations in Spain improved the outreach of the operations . 
- In March 1943 an ammunition dump was blown up by Calvo 's agents . The British , growing suspicious of some of the saboteurs , banned them from entering Gibraltar . This forced the Abwehr to ask Calvo for new personnel . A Spaniard working on the Rock , José Martín Muñoz , was responsible for the explosion and fire at a large fuel tank at Coaling Island on 30 June 1943 ; this mission , however , would be the first and the last for Muñoz , because he was cornered and arrested by British authorities in August , when he tried to smuggle a bomb into a weapons magazine inside Ragged Staff Cave . After being sentenced to death , he was hanged on 11 January 1944 in Gibraltar by British executioner Albert Pierrepoint . A member of an unrelated Abwehr sabotage network , Luis López Cordón @-@ Cuenca ( also arrested in 1943 ) was executed by Pierrepoint on the same day . Calvo himself was put under arrest by the Spanish police and neutralized . He would be a free man again in December , when he rejoined the Abwehr in Madrid , under direct orders of Wolfgang Blaum , aka Baumann , head of the sabotage section in Spain . After a Falangist attempt against the life of pro @-@ allied General José Enrique Varela , perpetrated by Sánchez Rubio network 's agent Juan José Domínguez and a meeting between Anthony Eden and the Spanish ambassador at London , Jacobo Fitz @-@ James Stuart , Abwehr activities around Gibraltar came to an end . 
- 
- = = Operation Tracer : 1941 – 1942 = = 
- 
- Operation Tracer was a top @-@ secret British stay @-@ behind spying mission that was only to be implemented if Gibraltar was captured by the Axis Powers . Six men were to be sealed in a cave and left with only enough supplies for a year . The volunteers — two doctors , three signalmen and their leader — would run an observation post with one 12 @-@ inch ( 300 mm ) by 6 @-@ inch ( 150 mm ) slit looking over the harbour and a concealed outdoor terrace over the Mediterranean . The team would then wire back all shipping movements to the British Admiralty . 
- They were told there would be no way out and anyone who died within the chamber would have to be embalmed and cemented into the brick floor . Only if Germany was defeated within their first year would they be released . 
- As the threat of invasion was clearly felt in late 1941 , an idea for a series of secret observation posts ( first in Gibraltar and later in other places like Malta and Aden ) was put together under Operation Tracer . 
- Work in Gibraltar began immediately under Commander Geoffrey Birley and his chief engineer Colonel Fordham . The site chosen at Lord Airey 's Battery on the southern tip of the Rock already had an existing tunnelling scheme for a shelter . Extensive trials of the equipment began in January 1942 under the eye of MI6 radio expert Colonel Richard Gambier @-@ Parry . Much thought was also given to the type of men needed for such a strange and demanding task . A member of Scott ’ s ill @-@ fated expedition to the Antarctic , George Murray Levick was called up as Surgeon @-@ Commander to advise on survival techniques . There were practical matters such as diet , exercise , sanitation , and clothing to consider as well as vital " psychology of the personnel " . The full team was in place by the end of summer 1942 and their cavern fully equipped and ready for occupation . A comprehensive manual was prepared on all aspects of the operation and it was considered that similar secret lookout posts should be prepared throughout the world in the event of future wars . However , Operation Tracer was never needed , as Adolf Hitler turned his attention away from Gibraltar and towards the Eastern Front . 
- The operation had been clouded in mystery until the discovery of papers at the Public Record Office in Kew UK . Previously in the 1960s , details of the story were told to a journalist by his intelligence service contacts and he wrote these up as " Operation Monkey " , yet facts were very sparse . 
- In 1997 " Stay Behind Cave " ( as it was nicknamed ) was discovered in Gibraltar by the Gibraltar Caving Group , but no account was ever obtained from anyone associated with the mission . The discovery came about when the group encountered a strong gust of wind in a tunnel . Further searching led them to break through a wall into chambers which had never been used and had remained sealed for over 50 years . 
- In November 2006 Jim Crone and Sergeant Major Pete Jackson , senior tunnel guide with the Royal Gibraltar Regiment , met possibly the only member of Operation Tracer still alive when they travelled to meet Dr. W. A. Bruce Cooper at his home in England . Cooper , 92 at the time , provided an opportunity to shed light on the operation with his direct involvement in the mission as a Surgeon @-@ Lieutenant in the Royal Navy Volunteer Reserve ( RNVR ) . He recalled stories about his colleagues , his training , and his feelings about the task . 
- 
- = = Mediterranean U @-@ boat Campaign : 1941 – 1944 = = 
- 
- The Mediterranean U @-@ boat Campaign lasted approximately from 21 September 1941 to May 1944 . The Kriegsmarine tried to isolate Gibraltar , Malta , and Suez and disrupt Britain 's trade routes . More than sixty U @-@ boats were sent to interdict Allied shipping in the Mediterranean Sea . Many of these U @-@ boats were themselves attacked negotiating the Strait of Gibraltar controlled by Britain . Nine U @-@ boats were sunk while attempting passage and ten more were damaged . 
- 
- = = North African Campaign : 1942 = = 
- 
- Plans for the Allied counter offensive after the attack on Pearl Harbor were ongoing by mid @-@ 1942 . An invasion of Europe in 1943 would be unworkable , but the allies could attack the " soft underbelly of Europe " through the Mediterranean , as Prime Minister Winston Churchill put it . Devised by President Franklin Roosevelt and Churchill and code named Operation Torch , the plan was to occupy French North Africa : Morocco , Algeria , and Tunisia . From these French colonies , attacks could be launched that would drive Italy out of the war . 
- In July 1942 , Lieutenant General Dwight D. Eisenhower was appointed Allied Commander @-@ in @-@ Chief of Operation Torch . Churchill placed Gibraltar under the command of General Eisenhower as the temporary headquarters for this , the first large @-@ scale Anglo @-@ American operation of the war . He arrived in Gibraltar on 5 November 1942 to take over , not just command of Operation Torch itself , but also military command of Gibraltar . 
- General Eisenhower stayed at The Convent , the official Governor 's residence , but his operational headquarters were in a small chamber in a tunnel in the heart of the Rock . In his memoirs General Eisenhower wrote : 
- The subterranean passages under the Rock provided the sole available office space , and in them was located the signal equipment by which we expected to keep in touch with the commanders of the three assault forces . The eternal darkness of the tunnels was here and there partially pierced by feeble electric bulbs . Damp , cold air in block @-@ long passages was heavy with stagnation and did not noticeably respond to the clattering efforts of electric fans . Through the arched ceilings came a constant drip , drip , drip of surface water that faithfully but drearily ticked off the seconds of the interminable , almost unendurable , wait which always occurs between completion of a military plan and the moment action begins . 
- One hundred thousand soldiers on the high seas in a multitude of transports converged on Gibraltar . More than 400 aircraft of all types were crammed into the dispersal areas around the Gibraltar runway . Fighters had been shipped in crates and assembled on the airfield . Every available area of storage was taken up with ammunition , fuel , and other essential supplies . 168 American pilots were housed in the RAF messes at North Front . 
- On 8 November 1942 , 466 aircraft from Gibraltar landed on captured North African airfields . 
- From their headquarters in Gibraltar , General Eisenhower and Admiral Sir Andrew Browne Cunningham directed Operation Torch , the first major combined combat operation during World War II involving American and British forces . 
- 
- = = = War tunnels = = = 
- 
- Given that Gibraltar was a small town with only a few defences protecting it , the solution was to build a massive series of tunnels and chambers inside the natural protection of the Rock of Gibraltar . This " town " inside the Rock contained its own power station , water supply , and hospital . Some soldiers posted here would not see the light of day for months on end . Two Canadian engineer companies , the only soldiers with diamond @-@ tipped drills , and 5 British engineer companies , added some 30 miles ( 48 km ) of such tunnels , a feat thought impossible at the time . That was enough to hold all 30 @,@ 000 troops on the rock . Today , the rock has more underground tunnels than roads . 
- 
- = = Death of Władysław Sikorski : 1943 = = 
- 
- On 4 July 1943 , a Liberator bomber from RAF Transport Command took off from Gibraltar for England . On board was General Władysław Sikorski , Prime Minister of Poland 's London @-@ based government in exile and Commander @-@ in @-@ Chief of its armed forces , returning from visiting Polish troops in the Middle East . 
- The aircraft climbed normally from the runway , levelled off to gather speed but then suddenly lost height and crashed into the harbour . The 62 @-@ year @-@ old general died , along with 15 others . The sole survivor was the Czech @-@ born pilot , Eduard Prchal , who was rescued by an RAF launch . The bodies of five passengers and crew , including Sikorski 's daughter , were never found . 
- The coffins of General Sikorski and his Chief @-@ of @-@ Staff , General Kilimecki , were draped in the Polish National Flag and lay in state in the Cathedral of St. Mary the Crowned . After a Requiem Mass , the bodies were carried in procession to the H.M. Dockyard with full Military Honours to be shipped to London in anticipation that General Sikorski 's remains would one day be returned to a liberated Poland . The route to the dockyard was lined by British troops and the coffins carried and escorted by Polish Servicemen . 
- 
- = = = Investigation = = = 
- 
- In 1943 a British Court of Inquiry investigated the crash of Sikorski 's Liberator II AL523 , but was unable to determine the probable cause , finding only that it was an accident and the " aircraft became uncontrollable for reasons which cannot be established " . A popular theory was insufficient technical maintenance leading to jamming aircraft controls . Despite this finding , the political context of the event , coupled with a variety of curious circumstances , immediately gave rise to speculation that Sikorski 's death had been no accident , and may in fact have been the direct result of a Soviet , British or even Polish conspiracy . 
- 
- = = Aftermath = = 
- 
- The surrender of Italy in September 1943 lifted any possible objections to the return of the evacuees to the Rock . As a result , a Resettlement Board was established in November , and at a meeting of the Board on 8 February 1944 repatriation priorities were finally agreed . On 6 April 1944 the first group of 1 @,@ 367 repatriates arrived on the Rock directly from the United Kingdom and on 28 May , the first repatriation party left Madeira , and by the end of 1944 only 520 non @-@ priority evacuees remained on the island . 
- In London , home @-@ comers were making claims on the evacuees ’ wartime accommodation and 500 Gibraltarians were re @-@ evacuated to Scotland and 3 @,@ 000 to camps in Northern Ireland . Although the Governor , Lt. General Sir Noel Mason @-@ MacFarlane , fought valiantly on behalf of the evacuees and did not accept the lack of accommodation as a sufficient reason for the delays , as late as 1947 there were still 2 @,@ 000 in Northern Irish camps . The last of the evacuees did not see the Rock again until 1951 . 
- 
- = = See Also = = 
- 
- Military history of the British Commonwealth in the Second World War 
- 
- 
- = Nerva = 
- 
- Nerva ( Latin : Marcus Cocceius Nerva Caesar Augustus ; 8 November , 30 AD – 27 January , 98 AD ) was Roman Emperor from 96 to 98 . Nerva became Emperor at the age of sixty @-@ five , after a lifetime of imperial service under Nero and the rulers of the Flavian dynasty . Under Nero , he was a member of the imperial entourage and played a vital part in exposing the Pisonian conspiracy of 65 . Later , as a loyalist to the Flavians , he attained consulships in 71 and 90 during the reigns of Vespasian and Domitian respectively . 
- On 18 September 96 , Domitian was assassinated in a palace conspiracy involving members of the Praetorian Guard and several of his freedmen . On the same day , Nerva was declared emperor by the Roman Senate . This was the first time the Senate elected a Roman Emperor . As the new ruler of the Roman Empire , he vowed to restore liberties which had been curtailed during the autocratic government of Domitian . 
- Nerva 's brief reign was marred by financial difficulties and his inability to assert his authority over the Roman army . A revolt by the Praetorian Guard in October 97 essentially forced him to adopt an heir . After some deliberation Nerva adopted Trajan , a young and popular general , as his successor . After barely fifteen months in office , Nerva died of natural causes on 27 January 98 . Upon his death he was succeeded and deified by Trajan . 
- Although much of his life remains obscure , Nerva was considered a wise and moderate emperor by ancient historians . Nerva 's greatest success was his ability to ensure a peaceful transition of power after his death , thus founding the Nerva – Antonine dynasty . 
- 
- = = Early career = = 
- 
- 
- = = = Family = = = 
- 
- Marcus Cocceius Nerva was born in the village of Narni , 50 kilometers north of Rome , to the family of Marcus Cocceius Nerva , Suffect Consul in 40 , and Sergia Plautilla . Ancient sources report the date as either 30 or 35 . He had at least one attested sister , named Cocceia , who married Lucius Salvius Titianus Otho , the brother of the future Emperor Otho . 
- Like Vespasian , the founder of the Flavian dynasty , Nerva was a member of the Italian nobility rather than one of the elite of Rome . Nevertheless , the Cocceii were among the most esteemed and prominent political families of the late Republic and early Empire , attaining consulships in each successive generation . The direct ancestors of Nerva on his father 's side , all named Marcus Cocceius Nerva , were associated with imperial circles since the time of Emperor Augustus ( 27 BC – AD 14 ) . 
- His great @-@ grandfather was Consul in 36 BC ( in replacement , and abdicated ) , and Governor of Asia in the same year . His grandfather became Consul Suffect in July of either 21 or 22 , and was known as a personal friend of Emperor Tiberius ( AD 14 – 37 ) , accompanying the emperor during his voluntary seclusion on Capri from 23 onwards , dying in 33 . Nerva 's father , finally , attained the consulship in 40 under emperor Caligula ( 37 – 41 ) . The Cocceii were connected with the Julio @-@ Claudian dynasty through the marriage of Sergia Plautilla 's brother Octavius Laenas , and Rubellia Bassa , the great @-@ granddaughter of Tiberius . 
- 
- = = = Imperial service = = = 
- 
- Not much of Nerva 's early life or career is recorded , but it appears he did not pursue the usual administrative or military career . He was praetor @-@ elect in the year 65 and , like his ancestors , moved in imperial circles as a skilled diplomat and strategist . As an advisor to Emperor Nero , he successfully helped detect and expose the Pisonian conspiracy of 65 . Exactly what his contribution to the investigation was is not known but his services must have been considerable , since they earned him rewards equal to those of Nero 's guard prefect Tigellinus . He received triumphal honors — which were usually reserved for military victories — and the right to have his statues placed throughout the palace . 
- According to the contemporary poet Martial , Nero also held Nerva 's literary abilities in high esteem , hailing him as the " Tibullus of our time " . Another prominent member of Nero 's entourage was Vespasian , an old and respected general who had celebrated military triumphs during the 40s . It appears Vespasian befriended Nerva during his time as an imperial advisor , and may have asked him to watch over Vespasian 's youngest son Domitian when Vespasian departed for the Jewish war in 67 . 
- The suicide of Nero on 9 June 68 brought the Julio @-@ Claudian dynasty to an end , leading to the chaotic Year of the Four Emperors , which saw the successive rise and fall of the emperors Galba , Otho and Vitellius , until the accession of Vespasian on 21 December 69 . Virtually nothing is known of Nerva 's whereabouts during 69 , but despite the fact that Otho was his brother @-@ in @-@ law , he appears to have been one of the earliest and strongest supporters of the Flavians . 
- For services unknown , he was rewarded with a consulship early in Vespasian 's reign in 71 . This was a remarkable honour , not only because he held this office early under the new regime , but also because it was an ordinary consulship ( instead of a less prestigious suffect consulship ) , making him one of the few non @-@ Flavians to be honoured in this way under Vespasian . After 71 Nerva again disappears from historical record , presumably continuing his career as an inconspicuous advisor under Vespasian ( 69 – 79 ) and his sons Titus ( 79 – 81 ) and Domitian ( 81 – 96 ) . 
- He re @-@ emerges during the revolt of Saturninus in 89 . On 1 January , 89 , the governor of Germania Superior , Lucius Antonius Saturninus , and his two legions at Mainz , Legio XIV Gemina and Legio XXI Rapax , revolted against the Roman Empire with the aid of a tribe of the Chatti . The governor of Germania Inferior , Lappius Maximus , moved to the region at once , assisted by the procurator of Rhaetia , Titus Flavius Norbanus . Within twenty @-@ four days the rebellion was crushed , and its leaders at Mainz savagely punished . The mutinous legions were sent to the front of Illyricum , while those who had assisted in their defeat were duly rewarded . 
- Domitian opened the year following the revolt by sharing the consulship with Nerva . Again , the honour suggested Nerva had played a part in uncovering the conspiracy , perhaps in a fashion similar to what he did during the Pisonian conspiracy under Nero . Alternatively , Domitian may have selected Nerva as his colleague to emphasise the stability and status @-@ quo of the regime . The revolt had been suppressed , and the Empire could return to order . 
- 
- = = Emperor = = 
- 
- 
- = = = Accession = = = 
- 
- On 18 September , 96 , Domitian was assassinated in a palace conspiracy organised by court officials . The Fasti Ostienses , the Ostian Calendar , records that the same day the Senate proclaimed Marcus Cocceius Nerva emperor . Despite his political experience , this was a remarkable choice . Nerva was old and childless , and had spent much of his career out of the public light , prompting both ancient and modern authors to speculate on his involvement in Domitian 's assassination . 
- According to Cassius Dio , the conspirators approached Nerva as a potential successor prior to the assassination , which indicates that he was at least aware of the plot . Suetonius by contrast does not mention Nerva , but he may have omitted his role out of tactfulness . Considering the works of Suetonius were published under Nerva 's direct descendants Trajan and Hadrian , it would have been less than sensitive of him to suggest the dynasty owed its accession to murder . On the other hand , Nerva lacked widespread support in the Empire , and as a known Flavian loyalist his track record would not have recommended him to the conspirators . The precise facts have been obscured by history , but modern historians believe Nerva was proclaimed Emperor solely on the initiative of the Senate , within hours after the news of the assassination broke . 
- Although he appeared to be an unlikely candidate on account of his age and weak health , Nerva was considered a safe choice precisely because he was old and childless . Furthermore , he had close connections with the Flavian dynasty and commanded the respect of a substantial part of the Senate . Nerva had seen the anarchy which had resulted from the death of Nero ; he knew that to hesitate even for a few hours could lead to violent civil conflict . Rather than decline the invitation and risk revolts , he accepted . The decision may have been hasty so as to avoid civil war , but neither the Senate nor Nerva appears to have been involved in the conspiracy against Domitian . 
- Following the accession of Nerva as emperor , the Senate passed damnatio memoriae on Domitian : his coins and statues were melted , his arches were torn down and his name was erased from all public records . In many instances , existing portraits of Domitian , such as those found on the Cancelleria Reliefs , were simply recarved to fit the likeness of Nerva . This allowed quick production of new images and recycling of previous material . In addition , the vast palace which Domitian had erected on the Palatine Hill , known as the Flavian Palace , was renamed the " House of the People " , and Nerva himself took up residence in Vespasian 's former villa in the Gardens of Sallust . 
- 
- = = = Administration = = = 
- 
- The change of government was welcome particularly to the senators , who had been harshly persecuted during Domitian 's reign . As an immediate gesture of goodwill towards his supporters , Nerva publicly swore that no senators would be put to death as long as he remained in office . He called an end to trials based on treason , released those who had been imprisoned under these charges , and granted amnesty to many who had been exiled . 
- All properties which had been confiscated by Domitian were returned to their respective families . Nerva also sought to involve the Senate in his government , but this was not entirely successful . He continued to rely largely on friends and advisors that were known and trusted , and by maintaining friendly relations with the pro @-@ Domitianic faction of the Senate , he incurred hostility which may have been the cause for at least one conspiracy against his life . 
- Having been proclaimed emperor solely on the initiative of the Senate , Nerva had to introduce a number of measures to gain support among the Roman populace . As was custom by this time , a change of emperor was expected to bring with it a generous payment of gifts and money to the people and the army . Accordingly , a congiarium of 75 denarii per head was bestowed upon the citizens , while the soldiers of the Praetorian Guard received a donativum which may have amounted to as much as 5000 denarii per person . This was followed by a string of economic reforms intended to alleviate the burden of taxation from the most needy Romans . 
- To the poorest , Nerva granted allotments of land worth up to 60 million sesterces . He exempted parents and their children from a 5 % inheritance tax , and he made loans to Italian landowners on the condition that they pay interest of 5 % to their municipality to support the children of needy families ; alimentary schemes which were later expanded by Trajan , Antoninus Pius , and Marcus Aurelius . Furthermore , numerous taxes were remitted and privileges granted to Roman provinces . Namely , he probably abolished the Fiscus Iudaicus , the additional tax which all Jews throughout the Empire had to pay : some of his coins bear the legend FISCI IUDAICI CALUMNIA SUBLATA ( abolition of malicious prosecution regarding the Jewish tax ) . 
- Before long , Nerva 's expenses strained the economy of Rome and , although perhaps not ruinous to the extent once suggested by Syme , necessitated the formation of a special commission of economy to drastically reduce expenditures . The most superfluous religious sacrifices , games and horse races were abolished , while new income was generated from Domitian 's former possessions , including the auctioning of ships , estates , and even furniture . Large amounts of money were obtained from Domitian 's silver and gold statues , and Nerva forbade that similar images be made in his honor . 
- Because he reigned only briefly , Nerva 's public works were few ; he instead completed projects which had been initiated under Flavian rule . This included extensive repairs to the Roman road system and the expansion of the aqueducts . The latter program was headed by the former consul Sextus Julius Frontinus , who helped to put an end to abuses and later published a significant work on Rome 's water supply , De Aquis Urbis Romae . The only major landmarks constructed under Nerva were a granary , known as the Horrea Nervae , and a small Imperial Forum begun by Domitian , which linked the Forum of Augustus to the Temple of Peace . Little remains , partly because the Via dei Fori Imperiali cuts across it . 
- 
- = = = Crisis of succession = = = 
- 
- Despite Nerva 's measures to remain popular with the Senate and the Roman people , support for Domitian remained strong in the army , which had called for his deification immediately after the assassination . In an attempt to appease the soldiers of the Praetorian Guard , Nerva had dismissed their prefect Titus Petronius Secundus — one of the chief conspirators against Domitian — and replaced him with a former commander , Casperius Aelianus . 
- Likewise , the generous donativum bestowed upon the soldiers following his accession was expected to swiftly silence any protests against the violent regime change . The Praetorians considered these measures insufficient , however , and demanded the execution of Domitian 's assassins , which Nerva refused . Continued dissatisfaction with this state of affairs would ultimately lead to the gravest crisis of Nerva 's reign . 
- While the swift transfer of power following Domitian 's death had prevented a civil war from erupting , Nerva 's position as an emperor soon proved too vulnerable , and his benign nature turned into a reluctance to assert his authority . Upon his accession , he had ordered a halt to treason trials , but at the same time allowed the prosecution of informers by the Senate to continue . This measure led to chaos , as everyone acted in his own interests while trying to settle scores with personal enemies , leading the consul Fronto to famously remark that Domitian 's tyranny was ultimately preferable to Nerva 's anarchy . Early in 97 , a conspiracy led by the senator Gaius Calpurnius Piso Crassus Frugi Licinianus failed , but once again Nerva refused to put the conspirators to death , much to the disapproval of the Senate . 
- The situation was further aggravated by the absence of a clear successor , made more pressing because of Nerva 's old age and sickness . He had no natural children of his own and only distant relatives , who were unsuited for political office . A successor would have to be chosen from among the governors or generals in the Empire and it appears that , by 97 , Nerva was considering to adopt Marcus Cornelius Nigrinus Curiatius Maternus , the powerful governor of Syria . This was covertly opposed by those who supported the more popular military commander Marcus Ulpius Traianus , commonly known as Trajan , a general of the armies at the German frontier . 
- In October 97 these tensions came to a head when the Praetorian Guard , led by Casperius Aelianus , laid siege to the Imperial Palace and took Nerva hostage . He was forced to submit to their demands , agreeing to hand over those responsible for Domitian 's death and even giving a speech thanking the rebellious Praetorians . Titus Petronius Secundus and Parthenius , Domitian 's former chamberlain , were sought out and killed . Nerva was unharmed in this assault , but his authority was damaged beyond repair . 
- He realized that his position was no longer tenable without the support of an heir who had the approval of both the army and the people . Shortly thereafter , he announced the adoption of Trajan as his successor , and with this decision all but abdicated . Trajan was formally bestowed with the title of Caesar and shared the consulship with Nerva in 98 : 
- Contrary to the view here popularized by Cassius Dio , however , Nerva had in fact little choice with regard to his successor . Faced with a major crisis , he desperately needed the support of a man who could restore his damaged reputation . The only candidate with sufficient military experience , consular ancestry , and connections was Trajan . Likewise , Edward Gibbon 's assertion that Nerva hereby established a tradition of succession through adoption among the Five Good Emperors has found little support among modern historians . 
- 
- = = Death and legacy = = 
- 
- On 1 January , 98 , at the start of his fourth consulship , Nerva suffered a stroke during a private audience . Shortly thereafter he was struck by a fever and died at his villa in the Gardens of Sallust , on 28 January . He was deified by the Senate , and his ashes were laid to rest in the Mausoleum of Augustus . 
- Nerva was succeeded without incident by his adopted son Trajan , who was greeted by the Roman populace with much enthusiasm . According to Pliny the Younger , Trajan dedicated a temple in honour of Nerva , yet no trace of it has ever been found ; nor was a commemorative series of coins for the Deified Nerva issued until ten years after his death . According to Cassius Dio , however , the Guard prefect responsible for the mutiny against Nerva , Casperius Aelianus , was ' dismissed ' upon Trajan 's accession . 
- Due to the lack of written sources on this period , much of Nerva 's life has remained obscure . The most substantial surviving account of the reign of Nerva was written by the 3rd @-@ century historian Cassius Dio . His Roman History , which spans nearly a millennium , from the arrival of Aeneas in Italy until the year 229 , was composed more than one hundred years after Nerva had died . Further details are added by an abridged biography from the Epitome de Caesaribus , a work alleged to have been authored by the 4th @-@ century historian Aurelius Victor . 
- A more comprehensive text , presumed to describe the life of Nerva in closer detail , is the Histories , by the contemporary historian Tacitus . The Histories is an account of the history of Rome covering three decades from the suicide of emperor Nero in 69 until the death of Domitian in 96 . Unfortunately , a substantial part of the work has been lost , with only the first five books covering the Year of the Four Emperors remaining . In the introduction to his biography of Gnaeus Julius Agricola however , Tacitus speaks highly of Nerva , describing his reign as " the dawn of a most happy age , [ when ] Nerva Caesar blended things once irreconcilable , sovereignty and freedom " . 
- The surviving histories speak equally positively of Nerva 's brief reign , although none offer a substantial commentary on his policies . Both Cassius Dio and Aurelius Victor emphasize his wisdom and moderation , with Dio commending his decision to adopt Trajan as his heir . These views were later popularized by the 18th @-@ century historian Edward Gibbon in his History of the Decline and Fall of the Roman Empire . Gibbon considered Nerva the first of the Five Good Emperors , five successive rulers under whom the Roman Empire " was governed by absolute power , under the guidance of wisdom and virtue " from 96 until 180 . Nevertheless , even Gibbon notes that , compared to his successors , Nerva may have lacked the necessary qualifications for a successful reign : 
- Modern history has expanded upon this sentiment , characterizing Nerva as a well @-@ intentioned but weak and ineffectual ruler . The Roman Senate enjoyed renewed liberties under his rule , but Nerva 's mismanagement of the state finances and lack of authority over the army ultimately brought Rome near the edge of a significant crisis . The mutiny led by Casperius Aelianus was never intended as a coup , but a calculated attempt to put pressure on the emperor . The adoption of Trajan expanded his power base with a respected , reliable general as his successor . Murison concludes that Nerva 's real talents were in fact ill @-@ suited to the emperorship : 
- His place in Roman history is therefore summarized as a necessary , if tumultuous stop @-@ gap before the Trajanic @-@ Antonine dynasties . It is a fact of irony that even the only major public work completed during his reign , the Forum of Nerva , ultimately became known as the Forum Transitorium , or transitional forum . 
- Two modern statues which commemorate Nerva can be found in towns associated with him . There is an equestrian statue in Gloucester , England , a town which was founded in his honour . It is at the entrance to Southgate Street . There is also a statue at his alleged birthplace , Narni in Italy , at Cocceio Nerva street . 
- 
- = = Nerva – Antonine family tree = = 
- 
- 
- = = In popular culture = = 
- 
- Nerva was played by Norman Wooland in the 1951 film Quo Vadis . 
- He was also played by Giuliano Gemma in the 1964 film Revolt of the Praetorians . 
- 
- = = = Secondary material = = = 
- 
- Narnia web links , International links ' , International links from Narnia.it web site 
- Wend , David ( 1998 ) . " Nerva ( 96 – 98 A.D. ) " . De Imperatoribus Romanis . Retrieved 2007 @-@ 08 @-@ 11 . 
- Pelham , Henry Francis ( 1911 ) . " Nerva , Marcus Cocceius " . In Chisholm , Hugh . Encyclopædia Britannica 19 ( 11th ed . ) . Cambridge University Press. pp. 393 – 394 . 
- 
- = The Hustler ( film ) = 
- 
- The Hustler is a 1961 American drama film directed by Robert Rossen from Walter Tevis 's 1959 novel of the same name , adapted for the screen by Rossen and Sidney Carroll . It tells the story of small @-@ time pool hustler " Fast Eddie " Felson and his desire to break into the " major league " of professional hustling and high @-@ stakes wagering by high @-@ rollers that follows it . He throws his raw talent and ambition up against the best player in the country ; seeking to best the legendary pool player " Minnesota Fats . " After initially losing to Fats and getting involved with unscrupulous manager Bert Gordon , Eddie returns to try again , but only after paying a terrible personal price . 
- The film was shot on location in New York City . It stars Paul Newman as " Fast " Eddie Felson , Jackie Gleason as Minnesota Fats , Piper Laurie as Sarah , and George C. Scott as Bert . 
- The Hustler was a major critical and popular success , gaining a reputation as a modern classic . Its exploration of winning , losing , and character garnered a number of major awards ; it is also credited with helping to spark a resurgence in the popularity of pool . Real @-@ life pool player Rudolf Wanderone , known at the time as " New York Fats " and " Chicago Fats " , claimed to be the real life inspiration for Gleason 's character , Minnesota Fats , and adopted the name as his own . 
- 
- = = Plot = = 
- 
- Small @-@ time pool hustler " Fast Eddie " Felson travels cross @-@ country with his partner Charlie to challenge the legendary player " Minnesota Fats " . Arriving at Fats ' home pool hall , Eddie declares he will win $ 10 @,@ 000 that night . Fats arrives and he and Eddie agree to play straight pool for $ 200 a game . After initially falling behind , Eddie surges back to being $ 1 @,@ 000 ahead and suggests raising the bet to $ 1 @,@ 000 a game ; Fats agrees . He sends out a runner , Preacher , to Johnny 's Bar , ostensibly for whiskey , but really to get professional gambler Bert Gordon to the hall . Eddie gets ahead $ 11 @,@ 000 and Charlie tries to convince him to quit , but Eddie insists the game will end only when Fats says it is over . Fats agrees to continue after Bert labels Eddie a " loser . " After 25 hours and an entire bottle of bourbon , Eddie is ahead over $ 18 @,@ 000 , but loses it all along with all but $ 200 of his original stake . At their hotel later , Eddie leaves half of the remaining stake with a sleeping Charlie and leaves . 
- Eddie stashes his belongings at the local bus terminal , where he meets Sarah Packard , an alcoholic who is supported by her father , attends college part @-@ time , and walks with a limp . He meets her again at a bar . They go back to her place but she refuses to let him in , saying he is " too hungry " . Eddie moves into a rooming house and starts hustling for small stakes . He finds Sarah again and this time she takes him in , but with reservations . Charlie finds Eddie at Sarah 's and tries to persuade him to go back out on the road . Eddie refuses and Charlie realizes he plans to challenge Fats again . Eddie realizes that Charlie held out his percentage and becomes enraged , believing that with that money he could have rebounded to beat Fats . Eddie dismisses Charlie as a scared old man and tells him to " go lie down and die " by himself . 
- At Johnny 's Bar , Eddie joins a poker game where Bert is playing , and loses $ 20 . Afterward , Bert tells Eddie that he has talent as a pool player but no character . He figures that Eddie will need at least $ 3 @,@ 000 to challenge Fats again . Bert calls him a " born loser " but nevertheless offers to stake him in return for 75 % of his winnings ; Eddie refuses . 
- Eddie humiliates a local pool shark , exposing himself as a hustler , and the other players punish him by breaking his thumbs . As he heals , Sarah cares for him and tells him she loves him , but he cannot say the words in return . When Eddie is ready to play , he agrees to Bert 's terms , deciding that a " 25 % slice of something big is better than a 100 % slice of nothing " . 
- Bert , Eddie , and Sarah travel to the Kentucky Derby , where Bert arranges a match for Eddie against a wealthy local socialite named Findley . The game turns out to be carom billiards , not pool . When Eddie loses badly , Bert refuses to keep staking him . Sarah pleads with Eddie to leave with her , saying that the world he is living in and its inhabitants are " perverted , twisted , and crippled " ; he refuses . Seeing Eddie 's anger , Bert agrees to let the match continue at $ 1 @,@ 000 a game . Eddie comes back to win $ 12 @,@ 000 . He collects his $ 3 @,@ 000 share and decides to walk back to the hotel . Bert arrives first and subjects Sarah to a humiliating sexual encounter . After , she scrawls " PERVERTED " , " TWISTED " , and " CRIPPLED " in lipstick on the bathroom mirror . Eddie arrives back at the hotel to learn that she has killed herself . 
- Eddie returns to challenge Fats again , putting up his entire $ 3 @,@ 000 stake on a single game . He wins game after game , beating Fats so badly that Fats is forced to quit . Bert demands a share of Eddie 's winnings and threatens that Eddie will be injured unless he pays . But Eddie says that if he is not killed he will kill Bert when he recovers ; invoking the memory of Sarah , he shames Bert into giving up his claim . Instead , Bert orders Eddie never to walk into a big @-@ time pool hall again . Eddie and Fats compliment each other as players , and Eddie walks out . 
- 
- = = Cast = = 
- 
- Cast notes 
- Pool champion Willie Mosconi has a cameo appearance as Willie , who holds the stakes for Eddie and Fats 's games . Mosconi 's hands also appear in many of the closeup shots . 
- 
- = = Production = = 
- 
- The Tevis novel had been optioned several times , including by Frank Sinatra , but attempts to adapt it for the screen were unsuccessful . Director Rossen 's daughter Carol Rossen speculates that previous adaptations focused too much on the pool aspects of the story and not enough on the human interaction . Rossen , who had hustled pool himself as a youth and who had made an abortive attempt to write a pool @-@ themed play called Corner Pocket , optioned the book and teamed with Sidney Carroll to produce the script . 
- According to Bobby Darin 's agent , Martin Baum , Paul Newman 's agent turned down the part of Fast Eddie . Newman was originally unavailable to play Fast Eddie regardless , being committed to star opposite Elizabeth Taylor in the film Two for the Seesaw . Rossen offered Darin the part after seeing him on The Mike Wallace Interview . When Taylor was forced to drop out of Seesaw because of shooting overruns on Cleopatra , Newman was freed up to take the role , which he accepted after reading just half of the script . No one associated with the production officially notified Darin or his representatives that he had been replaced ; they found out from a member of the public at a charity horse race . 
- Rossen filmed The Hustler over six weeks , entirely in New York City . Much of the action was filmed at two now @-@ defunct pool halls , McGirr 's and Ames Billiard Academy . Other shooting locations included a townhouse on East 82nd Street , which served as the Louisville home of Murray Hamilton 's character Findley , and the Manhattan Greyhound bus terminal . The film crew built a dining area that was so realistic that confused passengers sat there and waited to place their orders . Willie Mosconi served as technical advisor on the film and shot a number of the trick shots in place of the actors . All of Gleason 's shots were his own ; they were filmed in wide @-@ angle to emphasize having the actor and the shot in the same frames . Rossen , in pursuit of the style he termed " neo @-@ neo @-@ realistic " , hired actual street thugs , enrolled them in the Screen Actors Guild and used them as extras . Scenes that were included in the shooting script but did not make it into the final film include a scene at Ames pool hall establishing that Eddie is on his way to town ( originally slated to be the first scene of the film ) and a longer scene of Preacher talking to Bert at Johnny 's Bar which establishes Preacher is a junkie . 
- Early shooting put more focus on the pool playing , but during filming Rossen made the decision to place more emphasis on the love story between Newman and Laurie 's characters . Despite the change in emphasis , Rossen still used the various pool games to show the strengthening of Eddie 's character and the evolution of his relationship to Bert and Sarah , through the positioning of the characters in the frame . For example , when Eddie is playing Findley , Eddie is positioned below Bert in a two shot but above Findley while still below Bert in a three shot . When Sarah enters the room , she is below Eddie in two shot while in a three shot Eddie is still below Bert . When Eddie is kneeling over Sarah 's body , Bert again appears above him but Eddie attacks Bert , ending up on top of him . Eddie finally appears above Bert in two shot when Eddie returns to beat Fats . 
- 
- = = Themes = = 
- 
- The Hustler is fundamentally a story of what it means to be a human being , couched within the context of winning and losing . Describing the film , Robert Rossen said : " My protagonist , Fast Eddie , wants to become a great pool player , but the film is really about the obstacles he encounters in attempting to fulfill himself as a human being . He attains self @-@ awareness only after a terrible personal tragedy which he has caused — and then he wins his pool game . " Roger Ebert concurs with this assessment , citing The Hustler as " one of the few American movies in which the hero wins by surrendering , by accepting reality instead of his dreams . " 
- The film was also somewhat autobiographical for Rossen , relating to his dealings with the House Un @-@ American Activities Committee . A screenwriter during the 1930s and ' 40s , he had been involved with the Communist Party in the 1930s and refused to name names at his first HUAC appearance . Ultimately he changed his mind and identified friends and colleagues as party members . Similarly , Felson sells his soul and betrays the one person who really knows and loves him in a Faustian pact to gain character . 
- Film and theatre historian Ethan Mordden has identified The Hustler as one of a handful of films from the early 1960s that re @-@ defined the relationship of films to their audiences . This new relationship , he writes , is " one of challenge rather than flattery , of doubt rather than certainty . " No film of the 1950s , Mordden asserts , " took such a brutal , clear look at the ego @-@ affirmation of the one @-@ on @-@ one contest , at the inhumanity of the winner or the castrated vulnerability of the loser . " Although some have suggested the resemblance of this film to classic film noir , Mordden rejects the comparison based on Rossen 's ultra @-@ realistic style , also noting that the film lacks noir 's " Treacherous Woman or its relish in discovering crime among the bourgeoisie , hungry bank clerks and lusty wives . " Mordden does note that while Fast Eddie " has a slight fifties ring " , the character " makes a decisive break with the extraordinarily feeling tough guys of the ' rebel ' era ... [ b ] ut he does end up seeking out his emotions " and telling Bert that he is a loser because he 's dead inside . 
- 
- = = Reception = = 
- 
- The Hustler had its world premiere in Washington , D.C. on September 25 , 1961 . Prior to the premiere , Richard Burton hosted a midnight screening of the film for the casts of the season 's Broadway shows , which generated a great deal of positive word of mouth . Initially reluctant to publicize the film , 20th Century Fox responded by stepping up its promotional activities . 
- The film was well received by critics , although with the occasional caveat . Variety praised the performances of the entire main cast but felt that the " sordid aspects " of the story prevented the film from achieving the " goal of being pure entertainment . " Variety also felt the film was far too long . Stanley Kauffmann , writing for The New Republic , concurred in part with this assessment . Kauffmann strongly praised the principal cast , calling Newman " first @-@ rate " and writing that Scott 's was " his most credible performance to date . " Laurie , he writes , gives her part " movingly anguished touches " ( although he also mildly criticizes her for over @-@ reliance on Method acting ) . While he found that the script " strains hard to give an air of menace and criminality to the pool hall " and also declares it " full of pseudo @-@ meaning " , Kauffmann lauds Rossen 's " sure , economical " direction , especially in regard to Gleason who , he says , does not so much act as " [ pose ] for a number of pictures which are well arranged by Rossen . It is the best use of a manikin by a director since Kazan photographed Burl Ives as Big Daddy . " The New York Times , despite finding that the film " strays a bit " and that the romance between Newman and Laurie 's characters " seems a mite far @-@ fetched " , nonetheless found that The Hustler " speaks powerfully in a universal language that spellbinds and reveals bitter truths . " 
- The Hustler received nine Academy Award nominations . The film won two , for Best Art Direction @-@ Set Decoration , Black @-@ and @-@ White ( Harry Horner and Gene Callahan ) and Best Cinematography , Black @-@ and @-@ White ( Eugen Schüfftan ) . The film was also nominated for Best Picture and Newman was nominated for Best Actor in a Leading Role . Gleason and Scott were both nominated for Best Actor in a Supporting Role ; Scott refused the nomination . Laurie was nominated for Best Actress in a Leading Role . Rossen received nominations for Best Director and , with Carroll , for Best Writing , Screenplay Based on Material from Another Medium . 
- Newman was nominated for a Golden Globe Award for Best Actor . Gleason and Scott were each nominated for Best Supporting Actor and Scott was also nominated as Best New Star of the Year . At the 1962 BAFTA Awards , The Hustler tied with the Soviet film Ballad of a Soldier for Best Film from Any Source . Newman won for Best Foreign Actor and Piper Laurie was nominated for Best Foreign Actress . Gleason was honored as Best Supporting Actor by the National Board of Review of Motion Pictures and the film was named among the Board 's ten best films of 1961 . Rossen was named Best Director by the New York Film Critics Circle Awards and Rossen and Carroll shared the Writers Guild of America Award for Best Written Drama . 
- American Film Institute Lists 
- AFI 's 100 Years ... 100 Movies - Nominated 
- AFI 's 100 Years ... 100 Thrills - Nominated 
- AFI 's 100 Years ... 100 Heroes and Villains : 
- Bert Gordon - Nominated Villain 
- AFI 's 100 Years ... 100 Movie Quotes : 
- " Eddie , you 're a born loser . " - Nominated 
- AFI 's 100 Years ... 100 Movies ( 10th Anniversary Edition ) - Nominated 
- AFI 's 10 Top 10 - # 6 Sports Film 
- 
- = = Legacy = = 
- 
- In the decades since its release , The Hustler has cemented its reputation as a classic . Roger Ebert , echoing earlier praise for the performances , direction , and cinematography and adding laurels for editor Dede Allen , cites the film as " one of those films where scenes have such psychic weight that they grow in our memories . " He further cites Fast Eddie Felson as one of " only a handful of movie characters so real that the audience refers to them as touchstones . " TV Guide calls the film a " dark stunner " offering " a grim world whose only bright spot is the top of the pool table , yet [ with ] characters [ who ] maintain a shabby nobility and grace . " The four leads are again lavishly praised for their performances and the film is summed up as " not to be missed . " 
- Paul Newman reprised his role as Fast Eddie Felson in the 1986 film The Color of Money , for which he won the Academy Award for Best Actor in a Leading Role . A number of observers and critics have suggested that this Oscar was in belated recognition for his performance in The Hustler . In 1997 , the Library of Congress selected The Hustler for preservation in the United States National Film Registry as " culturally , historically , or aesthetically significant . " Carroll and Rossen 's screenplay was selected by the Writers Guild of America in 2006 as the 96th best motion picture screenplay of all time . In June 2008 , AFI released its " Ten top Ten " — the best ten films in ten " classic " American film genres — after polling over 1 @,@ 500 people from the creative community . The Hustler was acknowledged as the sixth best film in the sports genre . 
- The Hustler is credited with sparking a resurgence in the popularity of pool in the United States , which had been on the decline for decades . The film also brought recognition to Willie Mosconi , who , despite having won multiple world championships , was virtually unknown to the general public . Perhaps the greatest beneficiary of the film 's popularity was a real @-@ life pool hustler named Rudolf Wanderone . Mosconi claimed in an interview at the time of the film 's release that the character of Minnesota Fats was based on Wanderone , who at the time was known as " New York Fatty " . Wanderone immediately adopted the Minnesota Fats nickname and parlayed his association with the film into book and television deals and other ventures . Author Walter Tevis denied for the rest of his life that Wanderone had played any role in the creation of the character . Other players would claim , with greater or lesser degrees of credibility , to have served as models for Fast Eddie , including Ronnie Allen , Ed Taylor , Ed Parker , and Eddie Pelkey . 
- 
diff --git a/server/transformers/.circleci/config.yml b/server/transformers/.circleci/config.yml
deleted file mode 100644
index b4d27d2d63563390c6716c6f1a90f77aa7adf3e5..0000000000000000000000000000000000000000
--- a/server/transformers/.circleci/config.yml
+++ /dev/null
@@ -1,143 +0,0 @@
-version: 2
-jobs:
-    run_tests_torch_and_tf:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            OMP_NUM_THREADS: 1
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[sklearn,tf,torch,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
-    run_all_tests_torch_and_tf:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            OMP_NUM_THREADS: 1
-            RUN_SLOW: yes
-            RUN_CUSTOM_TOKENIZERS: yes
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[mecab,sklearn,tf,torch,testing]
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
-    run_tests_torch:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.7
-        environment:
-            OMP_NUM_THREADS: 1
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[sklearn,torch,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
-    run_tests_tf:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.7
-        environment:
-            OMP_NUM_THREADS: 1
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[sklearn,tf,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
-    run_tests_custom_tokenizers:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            RUN_CUSTOM_TOKENIZERS: yes
-        steps:
-            - checkout
-            - run: sudo pip install .[mecab,testing]
-            - run: python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
-    run_examples_torch:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            OMP_NUM_THREADS: 1
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[sklearn,torch,testing]
-            - run: sudo pip install -r examples/requirements.txt
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
-    deploy_doc:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        steps:
-            - add_ssh_keys:
-                fingerprints:
-                    - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
-            - checkout
-            - run: sudo pip install .[tf,torch,docs]
-            - run: ./.circleci/deploy.sh
-    check_code_quality:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.6
-        resource_class: medium
-        parallelism: 1
-        steps:
-            - checkout
-            # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
-            - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
-            - run: sudo pip install .[tf,torch,quality]
-            - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
-            - run: isort --check-only --recursive examples templates tests src utils
-            - run: flake8 examples templates tests src utils
-    check_repository_consistency:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        resource_class: small
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install requests
-            - run: python ./utils/link_tester.py
-workflow_filters: &workflow_filters
-    filters:
-        branches:
-            only:
-                - master
-workflows:
-    version: 2
-    build_and_test:
-        jobs:
-            - check_code_quality
-            - check_repository_consistency
-            - run_examples_torch
-            - run_tests_custom_tokenizers
-            - run_tests_torch_and_tf
-            - run_tests_torch
-            - run_tests_tf
-            - deploy_doc: *workflow_filters
-    run_slow_tests:
-        triggers:
-            - schedule:
-                cron: "0 4 * * 1"
-                filters:
-                    branches:
-                        only:
-                            - master
-        jobs:
-            - run_all_tests_torch_and_tf
diff --git a/server/transformers/.circleci/deploy.sh b/server/transformers/.circleci/deploy.sh
deleted file mode 100755
index 74f2601a943fae9ebf52e78b547a3539bdadba78..0000000000000000000000000000000000000000
--- a/server/transformers/.circleci/deploy.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-cd docs
-
-function deploy_doc(){
-	echo "Creating doc at commit $1 and pushing to folder $2"
-	git checkout $1
-	if [ ! -z "$2" ]
-	then
-		if [ -d "$dir/$2" ]; then
-			echo "Directory" $2 "already exists"
-		else
-			echo "Pushing version" $2
-			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
-		fi
-	else
-		echo "Pushing master"
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
-	fi
-}
-
-deploy_doc "master"
-deploy_doc "b33a385" v1.0.0
-deploy_doc "fe02e45" v1.1.0
-deploy_doc "89fd345" v1.2.0
-deploy_doc "fc9faa8" v2.0.0
-deploy_doc "3ddce1d" v2.1.1
-deploy_doc "3616209" v2.2.0
-deploy_doc "d0f8b9a" v2.3.0
-deploy_doc "6664ea9" v2.4.0
\ No newline at end of file
diff --git a/server/transformers/.coveragerc b/server/transformers/.coveragerc
deleted file mode 100644
index 9a1103b8af3d012e8894408308f4b12dbcebf58e..0000000000000000000000000000000000000000
--- a/server/transformers/.coveragerc
+++ /dev/null
@@ -1,12 +0,0 @@
-[run]
-source=transformers
-omit =
-    # skip convertion scripts from testing for now
-    */convert_*
-    */__main__.py
-[report]
-exclude_lines =
-    pragma: no cover
-    raise
-    except
-    register_parameter
\ No newline at end of file
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md b/server/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md
deleted file mode 100644
index 9e1c20689e008c23b99cfa9eb967bb2c115d9472..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md
+++ /dev/null
@@ -1,22 +0,0 @@
----
-name: "\U0001F5A5 New benchmark"
-about: Benchmark a part of this library and share your results
-title: "[Benchmark]"
-labels: ''
-assignees: ''
-
----
-
-# 🖥 Benchmarking `transformers`
-
-## Benchmark
-
-Which part of `transformers` did you benchmark?
-
-## Set-up
-
-What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
-
-## Results
-
-Put your results here!
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md b/server/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md
deleted file mode 100644
index e91fc71635c2d156db257ad0a414dfd1ab47d9f6..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-name: "\U0001F31F New model addition"
-about: Submit a proposal/request to implement a new Transformer-based model
-title: ''
-labels: ''
-assignees: ''
-
----
-
-# 🌟 New model addition
-
-## Model description
-
-<!-- Important information -->
-
-## Open source status
-
-* [ ] the model implementation is available: (give details)
-* [ ] the model weights are available: (give details)
-* [ ] who are the authors: (mention them, if possible by @gh-username)
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/bug-report.md b/server/transformers/.github/ISSUE_TEMPLATE/bug-report.md
deleted file mode 100644
index cc03dd01fb1c298d3728baebee86fa38519448e0..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/bug-report.md
+++ /dev/null
@@ -1,52 +0,0 @@
----
-name: "\U0001F41B Bug Report"
-about: Submit a bug report to help us improve transformers
-title: ''
-labels: ''
-assignees: ''
-
----
-
-# 🐛 Bug
-
-## Information
-
-Model I am using (Bert, XLNet ...):
-
-Language I am using the model on (English, Chinese ...):
-
-The problem arises when using:
-* [ ] the official example scripts: (give details below)
-* [ ] my own modified scripts: (give details below)
-
-The tasks I am working on is:
-* [ ] an official GLUE/SQUaD task: (give the name)
-* [ ] my own task or dataset: (give details below)
-
-## To reproduce
-
-Steps to reproduce the behavior:
-
-1.
-2.
-3.
-
-<!-- If you have code snippets, error messages, stack traces please provide them here as well.
-     Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
-     Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->
-
-## Expected behavior
-
-<!-- A clear and concise description of what you would expect to happen. -->
-
-## Environment info
-<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
-     Don't forget to fill out the missing fields in that output! -->
-     
-- `transformers` version:
-- Platform:
-- Python version:
-- PyTorch version (GPU?):
-- Tensorflow version (GPU?):
-- Using GPU in script?:
-- Using distributed or parallel set-up in script?:
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/feature-request.md b/server/transformers/.github/ISSUE_TEMPLATE/feature-request.md
deleted file mode 100644
index 0d5234af32699e294a2d03a74bd1c7d35e2ceead..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/feature-request.md
+++ /dev/null
@@ -1,25 +0,0 @@
----
-name: "\U0001F680 Feature request"
-about: Submit a proposal/request for a new transformers feature
-title: ''
-labels: ''
-assignees: ''
-
----
-
-# 🚀 Feature request
-
-<!-- A clear and concise description of the feature proposal.
-     Please provide a link to the paper and code in case they exist. -->
-
-## Motivation
-
-<!-- Please outline the motivation for the proposal. Is your feature request
-     related to a problem? e.g., I'm always frustrated when [...]. If this is related
-     to another GitHub issue, please link here too. -->
-
-## Your contribution
-
-<!-- Is there any way that you could help, e.g. by submitting a PR?
-     Make sure to read the CONTRIBUTING.MD readme:
-     https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md -->
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/migration.md b/server/transformers/.github/ISSUE_TEMPLATE/migration.md
deleted file mode 100644
index 387d97d528d48c0157cc703795b50b47eb0b6d65..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/migration.md
+++ /dev/null
@@ -1,57 +0,0 @@
----
-name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
-about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
-title: ''
-labels: ''
-assignees: ''
-
----
-
-# 📚 Migration
-
-## Information
-
-<!-- Important information -->
-
-Model I am using (Bert, XLNet ...):
-
-Language I am using the model on (English, Chinese ...):
-
-The problem arises when using:
-* [ ] the official example scripts: (give details below)
-* [ ] my own modified scripts: (give details below)
-
-The tasks I am working on is:
-* [ ] an official GLUE/SQUaD task: (give the name)
-* [ ] my own task or dataset: (give details below)
-
-## Details
-
-<!-- A clear and concise description of the migration issue.
-    If you have code snippets, please provide it here as well.
-    Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
-    Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
-    -->
-
-## Environment info
-<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
-     Don't forget to fill out the missing fields in that output! -->
- 
-- `transformers` version:
-- Platform:
-- Python version:
-- PyTorch version (GPU?):
-- Tensorflow version (GPU?):
-- Using GPU in script?:
-- Using distributed or parallel set-up in script?:
-
-<!-- IMPORTANT: which version of the former library do you use? -->
-* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
-
-
-## Checklist
-
-- [ ] I have read the migration guide in the readme.
- ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
-  [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
-- [ ] I checked if a related official extension example runs on my machine.
diff --git a/server/transformers/.github/ISSUE_TEMPLATE/question-help.md b/server/transformers/.github/ISSUE_TEMPLATE/question-help.md
deleted file mode 100644
index ad842f133b96835006d10cb3613e380789037793..0000000000000000000000000000000000000000
--- a/server/transformers/.github/ISSUE_TEMPLATE/question-help.md
+++ /dev/null
@@ -1,29 +0,0 @@
----
-name: "❓ Questions & Help"
-about: Post your general questions on Stack Overflow tagged huggingface-transformers
-title: ''
-labels: ''
-assignees: ''
-
----
-
-# ❓ Questions & Help
-
-<!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
-     new models and benchmarks, and migration questions. For all other questions,
-     we direct you to Stack Overflow (SO) where a whole community of PyTorch and
-     Tensorflow enthusiast can help you out. Make sure to tag your question with the
-     right deep learning framework as well as the huggingface-transformers tag: 
-     https://stackoverflow.com/questions/tagged/huggingface-transformers 
-     
-     If your question wasn't answered after a period of time on Stack Overflow, you
-     can always open a question on GitHub. You should then link to the SO question 
-     that you posted.
-     -->
-
-## Details
-<!-- Description of your issue -->
-
-<!-- You should first ask your question on SO, and only if
-     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on Stack Overflow**: 
\ No newline at end of file
diff --git a/server/transformers/.github/stale.yml b/server/transformers/.github/stale.yml
deleted file mode 100644
index d9f6563218bd0fe4f7b1a36170801bdf982f12c9..0000000000000000000000000000000000000000
--- a/server/transformers/.github/stale.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-# Number of days of inactivity before an issue becomes stale
-daysUntilStale: 60
-# Number of days of inactivity before a stale issue is closed
-daysUntilClose: 7
-# Issues with these labels will never be considered stale
-exemptLabels:
-  - pinned
-  - security
-# Label to use when marking an issue as stale
-staleLabel: wontfix
-# Comment to post when marking an issue as stale. Set to `false` to disable
-markComment: >
-  This issue has been automatically marked as stale because it has not had
-  recent activity. It will be closed if no further activity occurs. Thank you
-  for your contributions.
-# Comment to post when closing a stale issue. Set to `false` to disable
-closeComment: false
\ No newline at end of file
diff --git a/server/transformers/.gitignore b/server/transformers/.gitignore
deleted file mode 100644
index c789666707081a381fcfbb919d0027316b32e903..0000000000000000000000000000000000000000
--- a/server/transformers/.gitignore
+++ /dev/null
@@ -1,141 +0,0 @@
-# Initially taken from Github's Python gitignore file
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# vscode
-.vscode
-
-# Pycharm
-.idea
-
-# TF code
-tensorflow_code
-
-# Models
-models
-proc_data
-
-# examples
-runs
-examples/runs
-
-# data
-/data
-serialization_dir
-
-# emacs
-*.*~
-debug.env
diff --git a/server/transformers/.gitrepo b/server/transformers/.gitrepo
deleted file mode 100644
index a9a0a5fcbf7eab3c4e519e805e136dc4a113d59b..0000000000000000000000000000000000000000
--- a/server/transformers/.gitrepo
+++ /dev/null
@@ -1,12 +0,0 @@
-; DO NOT EDIT (unless you know what you are doing)
-;
-; This subdirectory is a git "subrepo", and this file is maintained by the
-; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
-;
-[subrepo]
-	remote = https://github.com/bhoov/transformers.git
-	branch = exbert-mods
-	commit = a0b899d114c1891dc685ce448077efab4a386348
-	parent = 8235ef04d0dca4d47c9106f70c0bd8681895fb8f
-	method = merge
-	cmdver = 0.4.1
diff --git a/server/transformers/CONTRIBUTING.md b/server/transformers/CONTRIBUTING.md
deleted file mode 100644
index 4c313dad9fdd39d787a71399bf512b93603feecb..0000000000000000000000000000000000000000
--- a/server/transformers/CONTRIBUTING.md
+++ /dev/null
@@ -1,258 +0,0 @@
-# How to contribute to transformers?
-
-Everyone is welcome to contribute, and we value everybody's contribution. Code
-is thus not the only way to help the community. Answering questions, helping
-others, reaching out and improving the documentations are immensely valuable to
-the community.
-
-It also helps us if you spread the word: reference the library from blog posts
-on the awesome projects it made possible, shout out on Twitter every time it has
-helped you, or simply star the repo to say "thank you".
-
-## You can contribute in so many ways!
-
-There are 4 ways you can contribute to transformers:
-* Fixing outstanding issues with the existing code;
-* Implementing new models;
-* Contributing to the examples or to the documentation;
-* Submitting issues related to bugs or desired new features.
-
-*All are equally valuable to the community.*
-
-## Submitting a new issue or feature request
-
-Do your best to follow these guidelines when submitting an issue or a feature
-request. It will make it easier for us to come back to you quickly and with good
-feedback.
-
-### Did you find a bug?
-
-The transformers are robust and reliable thanks to the users who notify us of
-the problems they encounter. So thank you for reporting an issue.
-
-First, we would really appreciate it if you could **make sure the bug was not
-already reported** (use the search bar on Github under Issues).
-
-Did not find it? :( So we can act quickly on it, please follow these steps:
-
-* Include your **OS type and version**, the versions of **Python**, **PyTorch** and
-  **Tensorflow** when applicable;
-* A short, self-contained, code snippet that allows us to reproduce the bug in
-  less than 30s;
-* Provide the *full* traceback if an exception is raised.
-
-To get the OS and software versions automatically, you can run the following command:
-
-```bash
-python transformers-cli env
-```
-
-### Do you want to implement a new model?
-
-Awesome! Please provide the following information:
-
-* Short description of the model and link to the paper;
-* Link to the implementation if it is open-source;
-* Link to the model weights if they are available.
-
-If you are willing to contribute the model yourself, let us know so we can best
-guide you.
-
-We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
-
-### Do you want a new feature (that is not a model)?
-
-A world-class feature request addresses the following points:
-
-1. Motivation first:
-  * Is it related to a problem/frustration with the library? If so, please explain
-    why. Providing a code snippet that demonstrates the problem is best.
-  * Is it related to something you would need for a project? We'd love to hear
-    about it!
-  * Is it something you worked on and think could benefit the community?
-    Awesome! Tell us what problem it solved for you.
-2. Write a *full paragraph* describing the feature;
-3. Provide a **code snippet** that demonstrates its future use;
-4. In case this is related to a paper, please attach a link;
-5. Attach any additional information (drawings, screenshots, etc.) you think may help.
-
-If your issue is well written we're already 80% of the way there by the time you
-post it.
-
-We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
-
-## Start contributing! (Pull Requests)
-
-Before writing code, we strongly advise you to search through the exising PRs or
-issues to make sure that nobody is already working on the same thing. If you are
-unsure, it is always a good idea to open an issue to get some feedback.
-
-You will need basic `git` proficiency to be able to contribute to
-`transformers`. `git` is not the easiest tool to use but it has the greatest
-manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
-Git](https://git-scm.com/book/en/v2) is a very good reference.
-
-Follow these steps to start contributing:
-
-1. Fork the [repository](https://github.com/huggingface/transformers) by
-   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
-   under your GitHub user account.
-
-2. Clone your fork to your local disk, and add the base repository as a remote:
-
-   ```bash
-   $ git clone git@github.com:<your Github handle>/transformers.git
-   $ cd transformers
-   $ git remote add upstream https://github.com/huggingface/transformers.git
-   ```
-
-3. Create a new branch to hold your development changes:
-
-   ```bash
-   $ git checkout -b a-descriptive-name-for-my-changes
-   ```
-
-   **do not** work on the `master` branch.
-
-4. Set up a development environment by running the following command in a virtual environment:
-
-   ```bash
-   $ pip install -e ".[dev]"
-   ```
-
-   (If transformers was already installed in the virtual environment, remove
-   it with `pip uninstall transformers` before reinstalling it in editable
-   mode with the `-e` flag.)
-
-   Right now, we need an unreleased version of `isort` to avoid a
-   [bug](https://github.com/timothycrosley/isort/pull/1000):
-
-   ```bash
-   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
-   ```
-
-5. Develop the features on your branch.
-
-   As you work on the features, you should make sure that the test suite
-   passes:
-
-   ```bash
-   $ make test
-   ```
-
-   `transformers` relies on `black` and `isort` to format its source code
-   consistently. After you make changes, format them with:
-
-   ```bash
-   $ make style
-   ```
-
-   `transformers` also uses `flake8` to check for coding mistakes. Quality
-   control runs in CI, however you can also run the same checks with:
-
-   ```bash
-   $ make quality
-   ```
-
-   Once you're happy with your changes, add changed files using `git add` and
-   make a commit with `git commit` to record your changes locally:
-
-   ```bash
-   $ git add modified_file.py
-   $ git commit
-   ```
-
-   Please write [good commit
-   messages](https://chris.beams.io/posts/git-commit/).
-
-   It is a good idea to sync your copy of the code with the original
-   repository regularly. This way you can quickly account for changes:
-
-   ```bash
-   $ git fetch upstream
-   $ git rebase upstream/master
-   ```
-
-   Push the changes to your account using:
-
-   ```bash
-   $ git push -u origin a-descriptive-name-for-my-changes
-   ```
-
-6. Once you are satisfied (**and the checklist below is happy too**), go to the
-   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
-   to the project maintainers for review.
-
-7. It's ok if maintainers ask you for changes. It happens to core contributors
-   too! So everyone can see the changes in the Pull request, work in your local
-   branch and push the changes to your fork. They will automatically appear in
-   the pull request.
-
-
-### Checklist
-
-1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
-   the pull request description to make sure they are linked (and people
-   consulting the issue know you are working on it);
-3. To indicate a work in progress please prefix the title with `[WIP]`. These
-   are useful to avoid duplicated work, and to differentiate it from PRs ready
-   to be merged;
-4. Make sure pre-existing tests still pass;
-5. Add high-coverage tests. No quality test, no merge;
-6. All public methods must have informative docstrings;
-
-
-### Tests
-
-You can run 🤗 Transformers tests with `unittest` or `pytest`.
-
-We like `pytest` and `pytest-xdist` because it's faster. From the root of the
-repository, here's how to run tests with `pytest` for the library:
-
-```bash
-$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
-```
-
-and for the examples:
-
-```bash
-$ pip install -r examples/requirements.txt  # only needed the first time
-$ python -m pytest -n auto --dist=loadfile -s -v ./examples/
-```
-
-In fact, that's how `make test` and `make test-examples` are implemented!
-
-You can specify a smaller set of tests in order to test only the feature
-you're working on.
-
-By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
-`yes` to run them. This will download many gigabytes of models — make sure you
-have enough disk space and a good Internet connection, or a lot of patience!
-
-```bash
-$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
-$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/
-```
-
-Likewise, set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run
-tests for custom tokenizers, which don't run by default either.
-
-🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
-`pytest`-specific features in the test suite itself.
-
-This means `unittest` is fully supported. Here's how to run tests with
-`unittest`:
-
-```bash
-$ python -m unittest discover -s tests -t . -v
-$ python -m unittest discover -s examples -t examples -v
-```
-
-
-### Style guide
-
-For documentation strings, `transformers` follows the [google
-style](https://google.github.io/styleguide/pyguide.html).
-
-#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
diff --git a/server/transformers/LICENSE b/server/transformers/LICENSE
deleted file mode 100644
index d645695673349e3947e8e5ae42332d0ac3164cd7..0000000000000000000000000000000000000000
--- a/server/transformers/LICENSE
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/server/transformers/MANIFEST.in b/server/transformers/MANIFEST.in
deleted file mode 100644
index 1aba38f67a2211cf5b09466d7b411206cb7223bf..0000000000000000000000000000000000000000
--- a/server/transformers/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-include LICENSE
diff --git a/server/transformers/Makefile b/server/transformers/Makefile
deleted file mode 100644
index dc2a6491ee872fe2e206c3c410ac71bbfc664ec5..0000000000000000000000000000000000000000
--- a/server/transformers/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-.PHONY: quality style test test-examples
-
-# Check that source code meets quality standards
-
-quality:
-	black --check --line-length 119 --target-version py35 examples templates tests src utils
-	isort --check-only --recursive examples templates tests src utils
-	flake8 examples templates tests src utils
-
-# Format source code automatically
-
-style:
-	black --line-length 119 --target-version py35 examples templates tests src utils
-	isort --recursive examples templates tests src utils
-
-# Run tests for the library
-
-test:
-	python -m pytest -n auto --dist=loadfile -s -v ./tests/
-
-# Run tests for examples
-
-test-examples:
-	python -m pytest -n auto --dist=loadfile -s -v ./examples/
diff --git a/server/transformers/README.md b/server/transformers/README.md
deleted file mode 100644
index 2d31c823d3d5457f397fc026627919fe302920e1..0000000000000000000000000000000000000000
--- a/server/transformers/README.md
+++ /dev/null
@@ -1,684 +0,0 @@
-<p align="center">
-    <br>
-    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
-    <br>
-</p>
-<p align="center">
-    <a href="https://circleci.com/gh/huggingface/transformers">
-        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
-    </a>
-    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
-        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
-    </a>
-    <a href="https://huggingface.co/transformers/index.html">
-        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
-    </a>
-    <a href="https://github.com/huggingface/transformers/releases">
-        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
-    </a>
-</p>
-
-<h3 align="center">
-<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
-</h3>
-
-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-
-### Features
-
-- As easy to use as pytorch-transformers
-- As powerful and concise as Keras
-- High performance on NLU and NLG tasks
-- Low barrier to entry for educators and practitioners
-
-State-of-the-art NLP for everyone
-- Deep learning researchers
-- Hands-on practitioners
-- AI/ML/NLP teachers and educators
-
-Lower compute costs, smaller carbon footprint
-- Researchers can share trained models instead of always retraining
-- Practitioners can reduce compute time and production costs
-- 10 architectures with over 30 pretrained models, some in more than 100 languages
-
-Choose the right framework for every part of a model's lifetime
-- Train state-of-the-art models in 3 lines of code
-- Deep interoperability between TensorFlow 2.0 and PyTorch models
-- Move a single model between TF2.0/PyTorch frameworks at will
-- Seamlessly pick the right framework for training, evaluation, production
-
-
-| Section | Description |
-|-|-|
-| [Installation](#installation) | How to install the package |
-| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
-| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
-| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
-| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
-| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.4.0)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
-
-## Installation
-
-This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
-
-You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-
-Create a virtual environment with the version of Python you're going to use and activate it.
-
-Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you must install it from source.
-
-### With pip
-
-First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
-
-When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
-
-```bash
-pip install transformers
-```
-
-### From source
-
-Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
-
-When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install .
-```
-
-When you update the repository, you should upgrade the transformers installation and its dependencies as follows:
-
-```bash
-git pull
-pip install --upgrade .
-```
-
-### Run the examples
-
-Examples are included in the repository but are not shipped with the library.
-
-Therefore, in order to run the latest versions of the examples, you need to install from source, as described above.
-
-Look at the [README](https://github.com/huggingface/transformers/blob/master/examples/README.md) for how to run examples.
-
-### Tests
-
-A series of tests are included for the library and for some example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
-
-Here's the easiest way to run tests for the library:
-
-```bash
-pip install -e ".[testing]"
-make test
-```
-
-and for the examples:
-
-```bash
-pip install -e ".[testing]"
-pip install -r examples/requirements.txt
-make test-examples
-```
-
-For details, refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests).
-
-### Do you want to run a Transformer model on a mobile device?
-
-You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
-
-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
-
-## Model architectures
-
-🤗 Transformers currently provides the following NLU/NLG architectures:
-
-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-17. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
-
-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
-
-## Online demo
-
-**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
-You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
-
-> “🦄 Write with transformer is to writing what calculators are to calculus.”
-
-![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)
-
-## Quick tour
-
-Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
-
-```python
-import torch
-from transformers import *
-
-# Transformers has a unified API
-# for 10 transformer architectures and 30 pretrained weights.
-#          Model          | Tokenizer          | Pretrained weights shortcut
-MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
-          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
-          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
-          (CTRLModel,       CTRLTokenizer,       'ctrl'),
-          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
-          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
-          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
-          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
-         ]
-
-# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
-
-# Let's encode some text in a sequence of hidden-states using each model:
-for model_class, tokenizer_class, pretrained_weights in MODELS:
-    # Load pretrained model/tokenizer
-    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
-    model = model_class.from_pretrained(pretrained_weights)
-
-    # Encode text
-    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
-    with torch.no_grad():
-        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
-
-# Each architecture is provided with several classes for fine-tuning on down-stream tasks, e.g.
-BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
-                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]
-
-# All the classes for an architecture can be initiated from pretrained weights for this architecture
-# Note that additional weights added for fine-tuning are only initialized
-# and need to be trained on the down-stream task
-pretrained_weights = 'bert-base-uncased'
-tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
-for model_class in BERT_MODEL_CLASSES:
-    # Load pretrained model/tokenizer
-    model = model_class.from_pretrained(pretrained_weights)
-
-    # Models can return the full list of hidden-states & attention weights at each layer
-    model = model_class.from_pretrained(pretrained_weights,
-                                        output_hidden_states=True,
-                                        output_attentions=True)
-    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-    all_hidden_states, all_attentions = model(input_ids)[-2:]
-
-    # Models are compatible with Torchscript
-    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-    traced_model = torch.jit.trace(model, (input_ids,))
-
-    # Simple serialization for models and tokenizers
-    model.save_pretrained('./directory/to/save/')  # save
-    model = model_class.from_pretrained('./directory/to/save/')  # re-load
-    tokenizer.save_pretrained('./directory/to/save/')  # save
-    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load
-
-    # SOTA examples for GLUE, SQUAD, text generation...
-```
-
-## Quick tour TF 2.0 training and PyTorch interoperability
-
-Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
-
-```python
-import tensorflow as tf
-import tensorflow_datasets
-from transformers import *
-
-# Load dataset, tokenizer, model from pretrained model/vocabulary
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
-
-# Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
-
-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-# Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
-                    validation_data=valid_dataset, validation_steps=7)
-
-# Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
-
-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
-
-pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
-pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
-
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
-```
-
-## Quick tour of the fine-tuning/usage scripts
-
-**Important**
-Before running the fine-tuning scripts, please read the
-[instructions](#run-the-examples) on how to
-set up your environment to run the examples.
-
-The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
-
-- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
-- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
-- `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
-- other model-specific examples (see the documentation).
-
-Here are three quick usage examples for these scripts:
-
-### `run_glue.py`: Fine-tuning on GLUE tasks for sequence classification
-
-The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-You should also install the additional packages required by the examples:
-
-```shell
-pip install -r ./examples/requirements.txt
-```
-
-```shell
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python ./examples/run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-uncased \
-    --task_name $TASK_NAME \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/$TASK_NAME \
-    --max_seq_length 128 \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
-
-#### Fine-tuning XLNet model on the STS-B regression task
-
-This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
-Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below).
-
-```shell
-export GLUE_DIR=/path/to/glue
-
-python ./examples/run_glue.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train  \
-    --do_eval   \
-    --task_name=sts-b     \
-    --data_dir=${GLUE_DIR}/STS-B  \
-    --output_dir=./proc_data/sts-b-110   \
-    --max_seq_length=128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
-    --gradient_accumulation_steps=1 \
-    --max_steps=1200  \
-    --model_name=xlnet-large-cased   \
-    --overwrite_output_dir   \
-    --overwrite_cache \
-    --warmup_steps=120
-```
-
-On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
-
-#### Fine-tuning Bert model on the MRPC classification task
-
-This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) using distributed training on 8 V100 GPUs to reach an F1 > 92.
-
-```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py   \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --task_name MRPC \
-    --do_train   \
-    --do_eval   \
-    --do_lower_case   \
-    --data_dir $GLUE_DIR/MRPC/   \
-    --max_seq_length 128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
-    --learning_rate 2e-5   \
-    --num_train_epochs 3.0  \
-    --output_dir /tmp/mrpc_output/ \
-    --overwrite_output_dir   \
-    --overwrite_cache \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-  acc = 0.8823529411764706
-  acc_and_f1 = 0.901702786377709
-  eval_loss = 0.3418912578906332
-  f1 = 0.9210526315789473
-  global_step = 174
-  loss = 0.07231863956341798
-```
-
-### `run_squad.py`: Fine-tuning on SQuAD for question-answering
-
-This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-{"exact_match": 86.91579943235573, "f1": 93.1532499015869}
-```
-
-This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-### `run_generation.py`: Text generation with GPT, GPT-2, CTRL, Transformer-XL and XLNet
-
-A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
-
-Here is how to run the script with the small version of OpenAI GPT-2 model:
-
-```shell
-python ./examples/run_generation.py \
-    --model_type=gpt2 \
-    --length=20 \
-    --model_name_or_path=gpt2 \
-```
-
-and from the Salesforce CTRL model:
-```shell
-python ./examples/run_generation.py \
-    --model_type=ctrl \
-    --length=20 \
-    --model_name_or_path=ctrl \
-    --temperature=0 \
-    --repetition_penalty=1.2 \
-```
-
-## Quick tour of model sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
-```python
-"username/pretrained_model"
-```
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
-model = AutoModel.from_pretrained("username/pretrained_model")
-```
-
-Finally, list all your files on S3:
-```shell
-transformers-cli s3 ls
-# List all your S3 objects.
-```
-
-You can also delete files:
-
-```shell
-transformers-cli s3 rm …
-```
-
-## Quick tour of pipelines
-
-New in version `v2.3`: `Pipeline`s are high-level objects which automatically handle tokenization, running your data through a transformers model
-and outputting the result in a structured object.
-
-You can create `Pipeline` objects for the following down-stream tasks:
-
- - `feature-extraction`: Generates a tensor representation for the input sequence
- - `ner`: Generates named entity mapping for each word in the input sequence.
- - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
- - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
- - `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context.
- - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of most probable filled sequences, with their probabilities.
-
-```python
-from transformers import pipeline
-
-# Allocate a pipeline for sentiment-analysis
-nlp = pipeline('sentiment-analysis')
-nlp('We are very happy to include pipeline into the transformers repository.')
->>> {'label': 'POSITIVE', 'score': 0.99893874}
-
-# Allocate a pipeline for question-answering
-nlp = pipeline('question-answering')
-nlp({
-    'question': 'What is the name of the repository ?',
-    'context': 'Pipeline have been included in the huggingface/transformers repository'
-})
->>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
-```
-
-## Migrating from pytorch-transformers to transformers
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
-
-### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
-
-To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
-
-If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
-
-If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
-
-
-## Migrating from pytorch-pretrained-bert to transformers
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
-
-### Models always output `tuples`
-
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that every model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-
-The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
-
-In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-
-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
-
-```python
-# Let's load our model
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-# If you used to have this line in pytorch-pretrained-bert:
-loss = model(input_ids, labels=labels)
-
-# Now just use this line in transformers to extract the loss from the output tuple:
-outputs = model(input_ids, labels=labels)
-loss = outputs[0]
-
-# In transformers you can also have access to the logits:
-loss, logits = outputs[:2]
-
-# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
-outputs = model(input_ids, labels=labels)
-loss, logits, attentions = outputs
-```
-
-### Using hidden states
-
-By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings' final state.
-
-### Serialization
-
-Breaking change in the `from_pretrained()` method:
-
-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
-
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
-
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
-
-Here is an example:
-
-```python
-### Let's load a model and tokenizer
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-### Do some stuff to our model and tokenizer
-# Ex: add new tokens to the vocabulary and embeddings of our model
-tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
-model.resize_token_embeddings(len(tokenizer))
-# Train our model
-train(model)
-
-### Now let's save our model and tokenizer to a directory
-model.save_pretrained('./my_saved_model_directory/')
-tokenizer.save_pretrained('./my_saved_model_directory/')
-
-### Reload the model and the tokenizer
-model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
-tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
-```
-
-### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
-
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
-
-- it only implements weight decay correction,
-- schedules are now external (see below),
-- gradient clipping is now also external (see below).
-
-The new optimizer `AdamW` matches the PyTorch `Adam` optimizer API and lets you use standard PyTorch or apex methods for the schedule and clipping.
-
-The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
-
-Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` with the same schedule:
-
-```python
-# Parameters:
-lr = 1e-3
-max_grad_norm = 1.0
-num_training_steps = 1000
-num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
-
-### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
-### and used like this:
-for batch in train_data:
-    loss = model(batch)
-    loss.backward()
-    optimizer.step()
-
-### In Transformers, optimizer and schedules are split and instantiated like this:
-optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
-### and used like this:
-for batch in train_data:
-    model.train()
-    loss = model(batch)
-    loss.backward()
-    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
-    optimizer.step()
-    scheduler.step()
-    optimizer.zero_grad()
-```
-
-## Citation
-
-We now have a paper you can cite for the 🤗 Transformers library:
-```
-@article{Wolf2019HuggingFacesTS,
-  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Jamie Brew},
-  journal={ArXiv},
-  year={2019},
-  volume={abs/1910.03771}
-}
-```
diff --git a/server/transformers/deploy_multi_version_doc.sh b/server/transformers/deploy_multi_version_doc.sh
deleted file mode 100644
index 37c5de114f0cf44a71b8a86ea3fd8eb39ddf1338..0000000000000000000000000000000000000000
--- a/server/transformers/deploy_multi_version_doc.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-cd docs
-
-function deploy_doc(){
-	echo "Creating doc at commit $1 and pushing to folder $2"
-	git checkout $1
-	if [ ! -z "$2" ] 
-	then
-		echo "Pushing version" $2
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
-	else
-		echo "Pushing master"
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
-	fi
-}
-
-deploy_doc "master" 
-deploy_doc "b33a385" v1.0.0
-deploy_doc "fe02e45" v1.1.0
-deploy_doc "89fd345" v1.2.0
-deploy_doc "fc9faa8" v2.0.0
-deploy_doc "3ddce1d" v2.1.1
-deploy_doc "f2f3294" v2.2.0
-deploy_doc "d0f8b9a" v2.3.0
diff --git a/server/transformers/docker/Dockerfile b/server/transformers/docker/Dockerfile
deleted file mode 100644
index fed834ff88e89ee21e0919b068b0ead5b24984c6..0000000000000000000000000000000000000000
--- a/server/transformers/docker/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM pytorch/pytorch:latest
-
-RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-
-RUN pip install transformers
-
-WORKDIR /workspace
\ No newline at end of file
diff --git a/server/transformers/docs/Makefile b/server/transformers/docs/Makefile
deleted file mode 100644
index 8879933e6cda150267451c9e7d07dd22b7b0d3f1..0000000000000000000000000000000000000000
--- a/server/transformers/docs/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SOURCEDIR     = source
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/server/transformers/docs/README.md b/server/transformers/docs/README.md
deleted file mode 100644
index d1a8b24103ba562cfa630e4926910d3254872a8f..0000000000000000000000000000000000000000
--- a/server/transformers/docs/README.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Generating the documentation
-
-To generate the documentation, you first have to build it. Several packages are necessary to build the docs;
-you can install them with the following command, at the root of the code repository:
-
-```bash
-pip install -e ".[docs]"
-```
-
-## Packages installed
-
-Here's an overview of all the packages installed. If you ran the previous command installing all packages from
-`requirements.txt`, you do not need to run the following commands.
-
-Building it requires the package `sphinx` that you can
-install using:
-
-```bash
-pip install -U sphinx
-```
-
-You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
-[Read The Docs](https://readthedocs.org/). You can install it using the following command:
-
-```bash
-pip install sphinx_rtd_theme
-```
-
-The third necessary package is the `recommonmark` package to accept Markdown as well as reStructuredText:
-
-```bash
-pip install recommonmark
-```
-
-## Building the documentation
-
-Make sure that the source folder contains a symlink to the examples README (in `/examples`). Run the following
-command to generate it:
-
-```bash
-ln -s ../../examples/README.md examples.md
-```
-
-Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
-
-```bash
-make html
-```
-
----
-**NOTE**
-
-If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
-directory before rebuilding. Run the following command to clean and build:
-
-```bash
-make clean && make html
-```
-
----
-
-It should build the static app that will be available under `/docs/_build/html`
-
-## Adding a new element to the tree (toc-tree)
-
-Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
-in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
diff --git a/server/transformers/docs/source/_static/css/Calibre-Light.ttf b/server/transformers/docs/source/_static/css/Calibre-Light.ttf
deleted file mode 100644
index 2e6631909a671e74db99044a7a1dad512df82207..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/_static/css/Calibre-Light.ttf and /dev/null differ
diff --git a/server/transformers/docs/source/_static/css/Calibre-Medium.otf b/server/transformers/docs/source/_static/css/Calibre-Medium.otf
deleted file mode 100644
index f9f11ebe430e3745b7b363078530cd6305f04ebc..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/_static/css/Calibre-Medium.otf and /dev/null differ
diff --git a/server/transformers/docs/source/_static/css/Calibre-Regular.otf b/server/transformers/docs/source/_static/css/Calibre-Regular.otf
deleted file mode 100644
index 3801b704cc8b83ee419b44b160b4d2105f4e52f8..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/_static/css/Calibre-Regular.otf and /dev/null differ
diff --git a/server/transformers/docs/source/_static/css/Calibre-Thin.otf b/server/transformers/docs/source/_static/css/Calibre-Thin.otf
deleted file mode 100644
index 44f93821ee80e78a1a8d9aa92b319d29ea01240c..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/_static/css/Calibre-Thin.otf and /dev/null differ
diff --git a/server/transformers/docs/source/_static/css/code-snippets.css b/server/transformers/docs/source/_static/css/code-snippets.css
deleted file mode 100644
index 43acc6751c5ca59a16889bfffc471eb566f93af5..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/_static/css/code-snippets.css
+++ /dev/null
@@ -1,12 +0,0 @@
-
-.highlight .c1, .highlight .sd{
-    color: #999
-}
-
-.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
-    color: #FB8D68;
-}
-
-.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
-    color: #6670FF;
-}
\ No newline at end of file
diff --git a/server/transformers/docs/source/_static/css/huggingface.css b/server/transformers/docs/source/_static/css/huggingface.css
deleted file mode 100644
index 3f006a996ba80f53048e01dcad9a28a6f22dc937..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/_static/css/huggingface.css
+++ /dev/null
@@ -1,196 +0,0 @@
-/* The literal code blocks */
-.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
-    color: #6670FF;
-}
-
-/* To keep the logo centered */
-.wy-side-scroll {
-    width: auto;
-    font-size: 20px;
-}
-
-/* The div that holds the Hugging Face logo */
-.HuggingFaceDiv {
-    width: 100%
-}
-
-/* The research field on top of the toc tree */
-.wy-side-nav-search{
-    background-color: #6670FF;
-}
-
-/* The toc tree */
-.wy-nav-side{
-    background-color: #6670FF;
-}
-
-/* The selected items in the toc tree */
-.wy-menu-vertical li.current{
-    background-color: #A6B0FF;
-}
-
-/* When a list item that does belong to the selected block from the toc tree is hovered */
-.wy-menu-vertical li.current a:hover{
-    background-color: #B6C0FF;
-}
-
-/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
-.wy-menu-vertical li a:hover{
-    background-color: #A7AFFB;
-}
-
-/* The text items on the toc tree */
-.wy-menu-vertical a {
-    color: #FFFFDD;
-    font-family: Calibre-Light, sans-serif;
-}
-.wy-menu-vertical header, .wy-menu-vertical p.caption{
-    color: white;
-    font-family: Calibre-Light, sans-serif;
-}
-
-/* The color inside the selected toc tree block */
-.wy-menu-vertical li.toctree-l2 a, .wy-menu-vertical li.toctree-l3 a, .wy-menu-vertical li.toctree-l4 a {
-    color: black;
-}
-
-/* Inside the depth-2 selected toc tree block */
-.wy-menu-vertical li.toctree-l2.current>a {
-    background-color: #B6C0FF
-}
-.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {
-    background-color: #C6D0FF
-}
-
-/* Inside the depth-3 selected toc tree block */
-.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{
-    background-color: #D6E0FF
-}
-
-/* Inside code snippets */
-.rst-content dl:not(.docutils) dt{
-    font-size: 15px;
-}
-
-/* Links */
-a {
-    color: #6670FF;
-}
-
-/* Content bars */
-.rst-content dl:not(.docutils) dt {
-    background-color: rgba(251, 141, 104, 0.1);
-    border-right: solid 2px #FB8D68;
-    border-left: solid 2px #FB8D68;
-    color: #FB8D68;
-    font-family: Calibre-Light, sans-serif;
-    border-top: none;
-    font-style: normal !important;
-}
-
-/* Expand button */
-.wy-menu-vertical li.toctree-l2 span.toctree-expand,
-.wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li.current>a span.toctree-expand,
-.wy-menu-vertical li.toctree-l3 span.toctree-expand{
-    color: black;
-}
-
-/* Max window size */
-.wy-nav-content{
-    max-width: 1200px;
-}
-
-/* Mobile header */
-.wy-nav-top{
-    background-color: #6670FF;
-}
-
-
-/* Source spans */
-.rst-content .viewcode-link, .rst-content .viewcode-back{
-    color: #6670FF;
-    font-size: 110%;
-    letter-spacing: 2px;
-    text-transform: uppercase;
-}
-
-/* It would be better for table to be visible without horizontal scrolling */
-.wy-table-responsive table td, .wy-table-responsive table th{
-    white-space: normal;
-}
-
-.footer {
-    margin-top: 20px;
-}
-
-.footer__Social {
-    display: flex;
-    flex-direction: row;
-}
-
-.footer__CustomImage {
-    margin: 2px 5px 0 0;
-}
-
-/* class and method names in doc */
-.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
-    font-family: Calibre, sans-serif;
-    font-size: 20px !important;
-}
-
-/* class name in doc*/
-.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
-    margin-right: 10px;
-    font-family: Calibre-Medium, sans-serif;
-}
-
-/* Method and class parameters */
-.sig-param{
-    line-height: 23px;
-}
-
-/* Class introduction "class" string at beginning */
-.rst-content dl:not(.docutils) .property{
-    font-size: 18px;
-    color: black;
-}
-
-
-/* FONTS */
-body{
-    font-family: Calibre, sans-serif;
-    font-size: 16px;
-}
-
-h1 {
-    font-family: Calibre-Thin, sans-serif;
-    font-size: 70px;
-}
-
-h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
-    font-family: Calibre-Medium, sans-serif;
-}
-
-@font-face {
-    font-family: Calibre-Medium;
-    src: url(./Calibre-Medium.otf);
-    font-weight:400;
-}
-
-@font-face {
-    font-family: Calibre;
-    src: url(./Calibre-Regular.otf);
-    font-weight:400;
-}
-
-@font-face {
-    font-family: Calibre-Light;
-    src: url(./Calibre-Light.ttf);
-    font-weight:400;
-}
-
-@font-face {
-    font-family: Calibre-Thin;
-    src: url(./Calibre-Thin.otf);
-    font-weight:400;
-}
diff --git a/server/transformers/docs/source/_static/js/custom.js b/server/transformers/docs/source/_static/js/custom.js
deleted file mode 100644
index ec804b3704a1dc8c3eb021ac4fe6412112856722..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/_static/js/custom.js
+++ /dev/null
@@ -1,79 +0,0 @@
-function addIcon() {
-    const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
-    const image = document.createElement("img");
-    image.setAttribute("src", huggingFaceLogo);
-
-    const div = document.createElement("div");
-    div.appendChild(image);
-    div.style.textAlign = 'center';
-    div.style.paddingTop = '30px';
-    div.style.backgroundColor = '#6670FF';
-
-    const scrollDiv = document.querySelector(".wy-side-scroll");
-    scrollDiv.prepend(div);
-}
-
-function addCustomFooter() {
-    const customFooter = document.createElement("div");
-    const questionOrIssue = document.createElement("div");
-    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
-    customFooter.appendChild(questionOrIssue);
-    customFooter.classList.add("footer");
-
-    const social = document.createElement("div");
-    social.classList.add("footer__Social");
-
-    const imageDetails = [
-        { link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
-        { link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
-        { link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
-        { link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
-    ];
-
-    imageDetails.forEach(imageLinks => {
-        const link = document.createElement("a");
-        const image = document.createElement("img");
-        image.src = imageLinks.imageLink;
-        link.href = imageLinks.link;
-        image.style.width = "30px";
-        image.classList.add("footer__CustomImage");
-        link.appendChild(image);
-        social.appendChild(link);
-    });
-
-    customFooter.appendChild(social);
-    document.querySelector("footer").appendChild(customFooter);
-}
-
-function addGithubButton() {
-    const div = `
-        <div class="github-repo">
-            <a 
-                class="github-button"
-                href="https://github.com/huggingface/transformers" data-size="large" data-show-count="true" aria-label="Star huggingface/pytorch-transformers on GitHub">
-                Star
-            </a>
-        </div>
-    `;
-    document.querySelector(".wy-side-nav-search .icon-home").insertAdjacentHTML('afterend', div);
-}
-
-/*!
- * github-buttons v2.2.10
- * (c) 2019 なつき
- * @license BSD-2-Clause
- */
-/**
- * modified to run programmatically
- */
-function parseGithubButtons (){"use strict";var e=window.document,t=e.location,o=window.encodeURIComponent,r=window.decodeURIComponent,n=window.Math,a=window.HTMLElement,i=window.XMLHttpRequest,l="https://unpkg.com/github-buttons@2.2.10/dist/buttons.html",c=i&&i.prototype&&"withCredentials"in i.prototype,d=c&&a&&a.prototype.attachShadow&&!a.prototype.attachShadow.prototype,s=function(e,t,o){e.addEventListener?e.addEventListener(t,o):e.attachEvent("on"+t,o)},u=function(e,t,o){e.removeEventListener?e.removeEventListener(t,o):e.detachEvent("on"+t,o)},h=function(e,t,o){var r=function(n){return u(e,t,r),o(n)};s(e,t,r)},f=function(e,t,o){var r=function(n){if(t.test(e.readyState))return u(e,"readystatechange",r),o(n)};s(e,"readystatechange",r)},p=function(e){return function(t,o,r){var n=e.createElement(t);if(o)for(var a in o){var i=o[a];null!=i&&(null!=n[a]?n[a]=i:n.setAttribute(a,i))}if(r)for(var l=0,c=r.length;l<c;l++){var d=r[l];n.appendChild("string"==typeof d?e.createTextNode(d):d)}return n}},g=p(e),b=function(e){var t;return function(){t||(t=1,e.apply(this,arguments))}},m="body{margin:0}a{color:#24292e;text-decoration:none;outline:0}.octicon{display:inline-block;vertical-align:text-top;fill:currentColor}.widget{ display:inline-block;overflow:hidden;font-family:-apple-system, BlinkMacSystemFont, \"Segoe UI\", Helvetica, Arial, sans-serif;font-size:0;white-space:nowrap;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.btn,.social-count{display:inline-block;height:14px;padding:2px 5px;font-size:11px;font-weight:600;line-height:14px;vertical-align:bottom;cursor:pointer;border:1px solid #c5c9cc;border-radius:0.25em}.btn{background-color:#eff3f6;background-image:-webkit-linear-gradient(top, #fafbfc, #eff3f6 90%);background-image:-moz-linear-gradient(top, #fafbfc, #eff3f6 90%);background-image:linear-gradient(180deg, #fafbfc, #eff3f6 90%);background-position:-1px -1px;background-repeat:repeat-x;background-size:110% 
110%;border-color:rgba(27,31,35,0.2);-ms-filter:\"progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFFAFBFC', endColorstr='#FFEEF2F5')\";*filter:progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFFAFBFC', endColorstr='#FFEEF2F5')}.btn:active{background-color:#e9ecef;background-image:none;border-color:#a5a9ac;border-color:rgba(27,31,35,0.35);box-shadow:inset 0 0.15em 0.3em rgba(27,31,35,0.15)}.btn:focus,.btn:hover{background-color:#e6ebf1;background-image:-webkit-linear-gradient(top, #f0f3f6, #e6ebf1 90%);background-image:-moz-linear-gradient(top, #f0f3f6, #e6ebf1 90%);background-image:linear-gradient(180deg, #f0f3f6, #e6ebf1 90%);border-color:#a5a9ac;border-color:rgba(27,31,35,0.35);-ms-filter:\"progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFF0F3F6', endColorstr='#FFE5EAF0')\";*filter:progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFF0F3F6', endColorstr='#FFE5EAF0')}.social-count{position:relative;margin-left:5px;background-color:#fff}.social-count:focus,.social-count:hover{color:#0366d6}.social-count b,.social-count i{position:absolute;top:50%;left:0;display:block;width:0;height:0;margin:-4px 0 0 -4px;border:solid transparent;border-width:4px 4px 4px 0;_line-height:0;_border-top-color:red !important;_border-bottom-color:red !important;_border-left-color:red !important;_filter:chroma(color=red)}.social-count b{border-right-color:#c5c9cc}.social-count i{margin-left:-3px;border-right-color:#fff}.lg .btn,.lg .social-count{height:16px;padding:5px 10px;font-size:12px;line-height:16px}.lg .social-count{margin-left:6px}.lg .social-count b,.lg .social-count i{margin:-5px 0 0 -5px;border-width:5px 5px 5px 0}.lg .social-count i{margin-left:-4px}\n",v={"mark-github":{width:16,height:16,path:'<path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 
2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>'},eye:{width:16,height:16,path:'<path fill-rule="evenodd" d="M8.06 2C3 2 0 8 0 8s3 6 8.06 6C13 14 16 8 16 8s-3-6-7.94-6zM8 12c-2.2 0-4-1.78-4-4 0-2.2 1.8-4 4-4 2.22 0 4 1.8 4 4 0 2.22-1.78 4-4 4zm2-4c0 1.11-.89 2-2 2-1.11 0-2-.89-2-2 0-1.11.89-2 2-2 1.11 0 2 .89 2 2z"/>'},star:{width:14,height:16,path:'<path fill-rule="evenodd" d="M14 6l-4.9-.64L7 1 4.9 5.36 0 6l3.6 3.26L2.67 14 7 11.67 11.33 14l-.93-4.74L14 6z"/>'},"repo-forked":{width:10,height:16,path:'<path fill-rule="evenodd" d="M8 1a1.993 1.993 0 0 0-1 3.72V6L5 8 3 6V4.72A1.993 1.993 0 0 0 2 1a1.993 1.993 0 0 0-1 3.72V6.5l3 3v1.78A1.993 1.993 0 0 0 5 15a1.993 1.993 0 0 0 1-3.72V9.5l3-3V4.72A1.993 1.993 0 0 0 8 1zM2 4.2C1.34 4.2.8 3.65.8 3c0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm3 10c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm3-10c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2z"/>'},"issue-opened":{width:14,height:16,path:'<path fill-rule="evenodd" d="M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"/>'},"cloud-download":{width:16,height:16,path:'<path fill-rule="evenodd" d="M9 12h2l-3 3-3-3h2V7h2v5zm3-8c0-.44-.91-3-4.5-3C5.08 1 3 2.92 3 5 1.02 5 0 6.52 0 8c0 1.53 1 3 3 3h3V9.7H3C1.38 9.7 1.3 8.28 1.3 8c0-.17.05-1.7 1.7-1.7h1.3V5c0-1.39 1.56-2.7 3.2-2.7 2.55 0 3.13 1.55 3.2 1.8v1.2H12c.81 0 2.7.22 2.7 2.2 0 2.09-2.25 2.2-2.7 2.2h-2V11h2c2.08 0 4-1.16 4-3.5C16 5.06 14.08 4 12 4z"/>'}},w={},x=function(e,t,o){var 
r=p(e.ownerDocument),n=e.appendChild(r("style",{type:"text/css"}));n.styleSheet?n.styleSheet.cssText=m:n.appendChild(e.ownerDocument.createTextNode(m));var a,l,d=r("a",{className:"btn",href:t.href,target:"_blank",innerHTML:(a=t["data-icon"],l=/^large$/i.test(t["data-size"])?16:14,a=(""+a).toLowerCase().replace(/^octicon-/,""),{}.hasOwnProperty.call(v,a)||(a="mark-github"),'<svg version="1.1" width="'+l*v[a].width/v[a].height+'" height="'+l+'" viewBox="0 0 '+v[a].width+" "+v[a].height+'" class="octicon octicon-'+a+'" aria-hidden="true">'+v[a].path+"</svg>"),"aria-label":t["aria-label"]||void 0},[" ",r("span",{},[t["data-text"]||""])]);/\.github\.com$/.test("."+d.hostname)?/^https?:\/\/((gist\.)?github\.com\/[^\/?#]+\/[^\/?#]+\/archive\/|github\.com\/[^\/?#]+\/[^\/?#]+\/releases\/download\/|codeload\.github\.com\/)/.test(d.href)&&(d.target="_top"):(d.href="#",d.target="_self");var u,h,g,x,y=e.appendChild(r("div",{className:"widget"+(/^large$/i.test(t["data-size"])?" lg":"")},[d]));/^(true|1)$/i.test(t["data-show-count"])&&"github.com"===d.hostname&&(u=d.pathname.replace(/^(?!\/)/,"/").match(/^\/([^\/?#]+)(?:\/([^\/?#]+)(?:\/(?:(subscription)|(fork)|(issues)|([^\/?#]+)))?)?(?:[\/?#]|$)/))&&!u[6]?(u[2]?(h="/repos/"+u[1]+"/"+u[2],u[3]?(x="subscribers_count",g="watchers"):u[4]?(x="forks_count",g="network"):u[5]?(x="open_issues_count",g="issues"):(x="stargazers_count",g="stargazers")):(h="/users/"+u[1],g=x="followers"),function(e,t){var o=w[e]||(w[e]=[]);if(!(o.push(t)>1)){var r=b(function(){for(delete w[e];t=o.shift();)t.apply(null,arguments)});if(c){var n=new i;s(n,"abort",r),s(n,"error",r),s(n,"load",function(){var e;try{e=JSON.parse(n.responseText)}catch(e){return void r(e)}r(200!==n.status,e)}),n.open("GET",e),n.send()}else{var a=this||window;a._=function(e){a._=null,r(200!==e.meta.status,e.data)};var 
l=p(a.document)("script",{async:!0,src:e+(/\?/.test(e)?"&":"?")+"callback=_"}),d=function(){a._&&a._({meta:{}})};s(l,"load",d),s(l,"error",d),l.readyState&&f(l,/de|m/,d),a.document.getElementsByTagName("head")[0].appendChild(l)}}}.call(this,"https://api.github.com"+h,function(e,t){if(!e){var n=t[x];y.appendChild(r("a",{className:"social-count",href:t.html_url+"/"+g,target:"_blank","aria-label":n+" "+x.replace(/_count$/,"").replace("_"," ").slice(0,n<2?-1:void 0)+" on GitHub"},[r("b"),r("i"),r("span",{},[(""+n).replace(/\B(?=(\d{3})+(?!\d))/g,",")])]))}o&&o(y)})):o&&o(y)},y=window.devicePixelRatio||1,C=function(e){return(y>1?n.ceil(n.round(e*y)/y*2)/2:n.ceil(e))||0},F=function(e,t){e.style.width=t[0]+"px",e.style.height=t[1]+"px"},k=function(t,r){if(null!=t&&null!=r)if(t.getAttribute&&(t=function(e){for(var t={href:e.href,title:e.title,"aria-label":e.getAttribute("aria-label")},o=["icon","text","size","show-count"],r=0,n=o.length;r<n;r++){var a="data-"+o[r];t[a]=e.getAttribute(a)}return null==t["data-text"]&&(t["data-text"]=e.textContent||e.innerText),t}(t)),d){var a=g("span",{title:t.title||void 0});x(a.attachShadow({mode:"closed"}),t,function(){r(a)})}else{var i=g("iframe",{src:"javascript:0",title:t.title||void 0,allowtransparency:!0,scrolling:"no",frameBorder:0});F(i,[0,0]),i.style.border="none";var c=function(){var a,d=i.contentWindow;try{a=d.document.body}catch(t){return void e.body.appendChild(i.parentNode.removeChild(i))}u(i,"load",c),x.call(d,a,t,function(e){var a=function(e){var t=e.offsetWidth,o=e.offsetHeight;if(e.getBoundingClientRect){var r=e.getBoundingClientRect();t=n.max(t,C(r.width)),o=n.max(o,C(r.height))}return[t,o]}(e);i.parentNode.removeChild(i),h(i,"load",function(){F(i,a)}),i.src=l+"#"+(i.name=function(e){var t=[];for(var r in e){var n=e[r];null!=n&&t.push(o(r)+"="+o(n))}return t.join("&")}(t)),r(i)})};s(i,"load",c),e.body.appendChild(i)}};t.protocol+"//"+t.host+t.pathname===l?x(e.body,function(e){for(var 
t={},o=e.split("&"),n=0,a=o.length;n<a;n++){var i=o[n];if(""!==i){var l=i.split("=");t[r(l[0])]=null!=l[1]?r(l.slice(1).join("=")):void 0}}return t}(window.name||t.hash.replace(/^#/,""))):function(t){if(/m/.test(e.readyState)||!/g/.test(e.readyState)&&!e.documentElement.doScroll)setTimeout(t);else if(e.addEventListener){var o=b(t);h(e,"DOMContentLoaded",o),h(window,"load",o)}else f(e,/m/,t)}(function(){for(var t=e.querySelectorAll?e.querySelectorAll("a.github-button"):function(){for(var t=[],o=e.getElementsByTagName("a"),r=0,n=o.length;r<n;r++)~(" "+o[r].className+" ").replace(/[ \t\n\f\r]+/g," ").indexOf(" github-button ")&&t.push(o[r]);return t}(),o=0,r=t.length;o<r;o++)!function(e){k(e,function(t){e.parentNode.replaceChild(t,e)})}(t[o])})};
-
-
-function onLoad() {
-    addIcon();
-    addCustomFooter();
-    addGithubButton();
-    parseGithubButtons();
-}
-
-window.addEventListener("load", onLoad);
diff --git a/server/transformers/docs/source/_static/js/huggingface_logo.svg b/server/transformers/docs/source/_static/js/huggingface_logo.svg
deleted file mode 100644
index 84974866ce772648e08e0bd4fa71d349152895c5..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/_static/js/huggingface_logo.svg
+++ /dev/null
@@ -1,47 +0,0 @@
-<svg width="95px" height="88px" viewBox="0 0 95 88" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-    <!-- Generator: Sketch 43.2 (39069) - http://www.bohemiancoding.com/sketch -->
-    <title>icon</title>
-    <desc>Created with Sketch.</desc>
-    <defs>
-        <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
-    </defs>
-    <g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
-        <g id="icon_desktop">
-            <g id="icon">
-                <g id="icon_desktop">
-                    <g id="Group-2">
-                        <g id="Group">
-                            <path d="M93.7930402,70.08 C94.5430402,72.24 94.3630402,74.54 93.3630402,76.54 C92.6430402,78 91.6130402,79.13 90.3530402,80.14 C88.8330402,81.34 86.9430402,82.36 84.6630402,83.34 C81.9430402,84.5 78.6230402,85.59 77.1030402,85.99 C73.2130402,87 69.4730402,87.64 65.6830402,87.67 C60.2630402,87.72 55.5930402,86.44 52.2730402,83.17 C50.5530402,83.38 48.8130402,83.5 47.0630402,83.5 C45.4030402,83.5 43.7630402,83.4 42.1330402,83.2 C38.8030402,86.45 34.1530402,87.72 28.7530402,87.67 C24.9630402,87.64 21.2230402,87 17.3230402,85.99 C15.8130402,85.59 12.4930402,84.5 9.77304019,83.34 C7.49304019,82.36 5.60304019,81.34 4.09304019,80.14 C2.82304019,79.13 1.79304019,78 1.07304019,76.54 C0.0830401858,74.54 -0.106959814,72.24 0.653040186,70.08 C-0.0469598142,68.43 -0.226959814,66.54 0.323040186,64.45 C0.573040186,63.5 0.983040186,62.62 1.50304019,61.84 C1.39304019,61.43 1.30304019,61.01 1.24304019,60.55 C0.863040186,57.81 1.81304019,55.31 3.60304019,53.37 C4.48304019,52.4 5.43304019,51.73 6.42304019,51.3 C5.69304019,48.2 5.31304019,45.01 5.31304019,41.75 C5.31304019,18.69 24.0030402,0 47.0630402,0 C54.9830402,0 62.3930402,2.2 68.7130402,6.04 C69.8530402,6.74 70.9730402,7.49 72.0430402,8.29 C72.5730402,8.69 73.1030402,9.1 73.6130402,9.53 C74.1330402,9.95 74.6430402,10.39 75.1330402,10.84 C76.6130402,12.19 78.0030402,13.64 79.2730402,15.19 C79.7030402,15.7 80.1130402,16.23 80.5130402,16.77 C81.3230402,17.84 82.0730402,18.95 82.7630402,20.1 C83.8130402,21.82 84.7330402,23.62 85.5330402,25.49 C86.0630402,26.74 86.5230402,28.02 86.9330402,29.33 C87.5430402,31.29 88.0130402,33.31 88.3330402,35.39 C88.4330402,36.08 88.5230402,36.78 88.5930402,37.48 C88.7330402,38.88 88.8130402,40.3 88.8130402,41.75 C88.8130402,44.97 88.4330402,48.13 87.7230402,51.18 C88.8230402,51.61 89.8630402,52.31 90.8330402,53.37 C92.6230402,55.31 93.5730402,57.82 93.1930402,60.56 C93.1330402,61.01 93.0430402,61.43 92.9330402,61.84 C93.4530402,62.62 93.8630402,63.5 94.1130402,64.45 
C94.6630402,66.54 94.4830402,68.43 93.7930402,70.08" id="Fill-1" fill="#FFFFFF" fill-rule="nonzero"></path>
-                            <circle id="Oval" fill="#FFD21E" fill-rule="nonzero" cx="46.75" cy="41.75" r="34.75"></circle>
-                            <path d="M81.5,41.75 C81.5,22.5581049 65.9418951,7 46.75,7 C27.5581049,7 12,22.5581049 12,41.75 C12,60.9418951 27.5581049,76.5 46.75,76.5 C65.9418951,76.5 81.5,60.9418951 81.5,41.75 Z M8,41.75 C8,20.3489659 25.3489659,3 46.75,3 C68.1510341,3 85.5,20.3489659 85.5,41.75 C85.5,63.1510341 68.1510341,80.5 46.75,80.5 C25.3489659,80.5 8,63.1510341 8,41.75 Z" id="Oval" fill="#FFAC03" fill-rule="nonzero"></path>
-                            <path d="M57.1723547,31.7151181 C58.0863134,32.7107502 57.3040427,35.2620959 58.7620957,35.2620959 C61.5235194,35.2620959 63.7620957,33.0235196 63.7620957,30.2620959 C63.7620957,27.5006721 61.5235194,25.2620959 58.7620957,25.2620959 C56.0006719,25.2620959 53.7620957,27.5006721 53.7620957,30.2620959 C53.7620957,31.5654666 56.3553563,30.8251108 57.1723547,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(58.762096, 30.262096) rotate(-28.000000) translate(-58.762096, -30.262096) "></path>
-                            <path d="M32.1723553,31.7151181 C33.086314,32.7107502 32.3040433,35.2620959 33.7620963,35.2620959 C36.52352,35.2620959 38.7620963,33.0235196 38.7620963,30.2620959 C38.7620963,27.5006721 36.52352,25.2620959 33.7620963,25.2620959 C31.0006725,25.2620959 28.7620963,27.5006721 28.7620963,30.2620959 C28.7620963,31.5654666 31.3553569,30.8251108 32.1723553,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(33.762096, 30.262096) scale(-1, 1) rotate(-28.000000) translate(-33.762096, -30.262096) "></path>
-                            <g id="Oval-4" transform="translate(33.500000, 41.500000)">
-                                <g id="Mask" fill-rule="nonzero" fill="#3A3B45">
-                                    <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
-                                </g>
-                                <g id="Clipped">
-                                    <mask id="mask-2" fill="white">
-                                        <use xlink:href="#path-1"></use>
-                                    </mask>
-                                    <g id="path-1"></g>
-                                    <path d="M13.25,25 C18.0399291,25 21.9229338,21.1169953 21.9229338,16.3270662 C21.9229338,12.5962324 19.5672252,9.41560375 16.2620987,8.19147116 C16.1404592,8.14641904 16.0175337,8.10401696 15.8933923,8.06433503 C15.0599892,7.79793679 14.1717882,10.6623144 13.25,10.6623144 C12.3886883,10.6623144 11.5567012,7.77968641 10.7713426,8.01349068 C7.18916268,9.07991937 4.57706621,12.3984489 4.57706621,16.3270662 C4.57706621,21.1169953 8.46007093,25 13.25,25 Z" id="Shape" fill="#EF4E4E" fill-rule="nonzero" mask="url(#mask-2)"></path>
-                                </g>
-                            </g>
-                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="70.25" cy="33.75" r="3.25"></circle>
-                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="23.75" cy="33.75" r="3.25"></circle>
-                        </g>
-                    </g>
-                </g>
-                <g id="Group-4" transform="translate(3.000000, 48.000000)" fill-rule="nonzero">
-                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
-                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
-                </g>
-                <g id="Group-4" transform="translate(70.500000, 66.500000) scale(-1, 1) translate(-70.500000, -66.500000) translate(50.000000, 48.000000)" fill-rule="nonzero">
-                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
-                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
-                </g>
-            </g>
-        </g>
-    </g>
-</svg>
\ No newline at end of file
diff --git a/server/transformers/docs/source/benchmarks.md b/server/transformers/docs/source/benchmarks.md
deleted file mode 100644
index decbac47b754e895d87b3130f33f1f2195b65036..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/benchmarks.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Benchmarks
-
-This section is dedicated to the benchmarks run on the library by maintainers, contributors and users. These
-benchmarks will help keep track of the performance improvements that are brought to our models across versions.
-
-## Benchmarking all models for inference
-
-As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
-and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
-TensorFlow XLA) and GPUs.
-
-The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
-
-The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
-
-## TF2 with mixed precision, XLA, Distribution (@tlkh)
-
-This work was done by [Timothy Liu](https://github.com/tlkh).
-
-There are very positive results to be gained from the various TensorFlow 2.0 features:
-
-- Automatic Mixed Precision (AMP)
-- XLA compiler
-- Distribution strategies (multi-GPU)
-
-The benefits are listed here (tested on CoLA, MRPC, SST-2):
-
-- AMP: Between 1.4x and 1.6x decrease in overall time without change in batch size
-- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
-- Distribution: Between 1.4x and 3.4x decrease in overall time on 4xV100
-- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
-
-The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
-on a single GPU gives the following results:
-
-- CoLA: AMP results in slightly lower acc (0.820 vs 0.824)
-- MRPC: AMP results in lower acc (0.823 vs 0.835)
-- SST-2: AMP results in slightly lower acc (0.918 vs 0.922)
-
-However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results:
-
-- CoLA: AMP results in higher acc (0.828 vs 0.812)
-- MRPC: AMP results in lower acc (0.817 vs 0.827)
-- SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
-
-The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
-
-Note: on some tasks (e.g. MRPC), the dataset is too small: the overhead of model compilation with XLA and of the
-distribution strategy setup means these features do not speed things up. The XLA compile time is also the reason why, although throughput
-can increase a lot (e.g. 2.7x for single GPU), the overall (end-to-end) training speed-up is smaller (as low as 1.4x).
-
-The benefits seen on SST-2 (a larger dataset) are much clearer.
-
-All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
diff --git a/server/transformers/docs/source/bertology.rst b/server/transformers/docs/source/bertology.rst
deleted file mode 100644
index c3d1b2f8b83e99510a45623492d0f2cb1a3b2dca..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/bertology.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-BERTology
----------
-
-There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
-
-
-* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
-* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
-
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
-
-
-* accessing all the hidden-states of BERT/GPT/GPT-2,
-* accessing all the attention weights for each head of BERT/GPT/GPT-2,
-* retrieving head output values and gradients in order to compute head importance scores and prune heads, as explained in https://arxiv.org/abs/1905.10650.
-
-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_, which extracts information from and prunes a model pre-trained on GLUE.
diff --git a/server/transformers/docs/source/conf.py b/server/transformers/docs/source/conf.py
deleted file mode 100644
index 65552cd14b0a88a050b929be2e4f1127a0366175..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/conf.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Configuration file for the Sphinx documentation builder.
-#
-# This file does only contain a selection of the most common options. For a
-# full list see the documentation:
-# http://www.sphinx-doc.org/en/master/config
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-import os
-import sys
-sys.path.insert(0, os.path.abspath('../../src'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = u'transformers'
-copyright = u'2019, huggingface'
-author = u'huggingface'
-
-# The short X.Y version
-version = u''
-# The full version, including alpha/beta/rc tags
-release = u'2.4.1'
-
-
-# -- General configuration ---------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.coverage',
-    'sphinx.ext.napoleon',
-    'recommonmark',
-    'sphinx.ext.viewcode',
-    'sphinx_markdown_tables'
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-source_suffix = ['.rst', '.md']
-# source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = None
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_rtd_theme'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-html_theme_options = {
-    'analytics_id': 'UA-83738774-2'
-}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself.  Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-# html_sidebars = {}
-
-
-# -- Options for HTMLHelp output ---------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'transformersdoc'
-
-
-# -- Options for LaTeX output ------------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, 'transformers.tex', u'transformers Documentation',
-     u'huggingface', 'manual'),
-]
-
-
-# -- Options for manual page output ------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'transformers', u'transformers Documentation',
-     [author], 1)
-]
-
-
-# -- Options for Texinfo output ----------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (master_doc, 'transformers', u'transformers Documentation',
-     author, 'transformers', 'One line description of project.',
-     'Miscellaneous'),
-]
-
-
-# -- Options for Epub output -------------------------------------------------
-
-# Bibliographic Dublin Core info.
-epub_title = project
-
-# The unique identifier of the text. This can be a ISBN number
-# or the project homepage.
-#
-# epub_identifier = ''
-
-# A unique identification for the text.
-#
-# epub_uid = ''
-
-# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
-
-def setup(app):
-    app.add_stylesheet('css/huggingface.css')
-    app.add_stylesheet('css/code-snippets.css')
-    app.add_js_file('js/custom.js')
-
-# -- Extension configuration -------------------------------------------------
diff --git a/server/transformers/docs/source/converting_tensorflow_models.rst b/server/transformers/docs/source/converting_tensorflow_models.rst
deleted file mode 100644
index 595f134fb227c20e13e84906e1bf7f4d73231880..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/converting_tensorflow_models.rst
+++ /dev/null
@@ -1,137 +0,0 @@
-Converting Tensorflow Checkpoints
-================================================
-
-A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library.
-
-.. note::
-    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
-    available in any transformers >= 2.3.0 installation.
-
-    The documentation below reflects the **transformers-cli convert** command format.
-
-BERT
-^^^^
-
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
-
-This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
-
-You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
-
-To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
-
-Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
-
-.. code-block:: shell
-
-   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-
-   transformers-cli convert --model_type bert \
-     --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-     --config $BERT_BASE_DIR/bert_config.json \
-     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
-
-You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
-
-OpenAI GPT
-^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ )
-
-.. code-block:: shell
-
-   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-
-   transformers-cli convert --model_type gpt \
-     --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
-
-
-OpenAI GPT-2
-^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
-
-.. code-block:: shell
-
-   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
-
-   transformers-cli convert --model_type gpt2 \
-     --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT2_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
-
-Transformer-XL
-^^^^^^^^^^^^^^
-
-Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
-
-.. code-block:: shell
-
-   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-
-   transformers-cli convert --model_type transfo_xl \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config TRANSFO_XL_CONFIG] \
-     [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
-
-
-XLNet
-^^^^^
-
-Here is an example of the conversion process for a pre-trained XLNet model:
-
-.. code-block:: shell
-
-   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-
-   transformers-cli convert --model_type xlnet \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-     --config $TRANSFO_XL_CONFIG_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--finetuning_task_name XLNET_FINETUNED_TASK]
-
-
-XLM
-^^^
-
-Here is an example of the conversion process for a pre-trained XLM model:
-
-.. code-block:: shell
-
-   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-
-   transformers-cli convert --model_type xlm \
-     --tf_checkpoint $XLM_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config XLM_CONFIG] \
-     [--finetuning_task_name XLM_FINETUNED_TASK]
\ No newline at end of file
diff --git a/server/transformers/docs/source/examples.md b/server/transformers/docs/source/examples.md
deleted file mode 120000
index 6fa53604d902346dcd54d7291e2f73a7ef858443..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/examples.md
+++ /dev/null
@@ -1 +0,0 @@
-../../examples/README.md
\ No newline at end of file
diff --git a/server/transformers/docs/source/glossary.rst b/server/transformers/docs/source/glossary.rst
deleted file mode 100644
index cfd8c50dd6bdb0f752b3edf8ac404518ab3e7f6f..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/glossary.rst
+++ /dev/null
@@ -1,145 +0,0 @@
-Glossary
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
-detailed here alongside usage examples.
-
-Input IDs
---------------------------
-
-The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
-numerical representations of tokens building the sequences that will be used as input by the model*.
-
-Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
-tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
-
-::
-
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-
-    sequence = "A Titan RTX has 24GB of VRAM"
-
-The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
-
-::
-
-    # Continuation of the previous script
-    tokenized_sequence = tokenizer.tokenize(sequence)
-    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
-
-These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
-this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
-`huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.
-
-::
-
-    # Continuation of the previous script
-    encoded_sequence = tokenizer.encode(sequence)
-    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
-
-The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
-
-Attention mask
---------------------------
-
-The attention mask is an optional argument used when batching sequences together. This argument indicates to the
-model which tokens should be attended to, and which should not.
-
-For example, consider these two sequences:
-
-::
-
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-
-    sequence_a = "This is a short sequence."
-    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
-
-    encoded_sequence_a = tokenizer.encode(sequence_a)
-    assert len(encoded_sequence_a) == 8
-
-    encoded_sequence_b = tokenizer.encode(sequence_b)
-    assert len(encoded_sequence_b) == 19
-
-These two sequences have different lengths and therefore can't be put together in the same tensor as-is. The first
-sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
-the length of the first one.
-
-In the first case, the list of IDs will be extended by the padding indices:
-
-::
-
-    # Continuation of the previous script
-    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
-
-    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
-    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
-
-These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
-the position of the padded indices so that the model does not attend to them. For the
-:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to while :obj:`0` indicates
-a padded value.
-
-The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
-
-::
-
-    # Continuation of the previous script
-    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
-
-    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-
-
-Token Type IDs
---------------------------
-
-Some models' purpose is to do sequence classification or question answering. These require two different sequences to
-be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
-tokens. For example, the BERT model builds its two sequence input as such:
-
-::
-
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-
-    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
-
-    sequence_a = "HuggingFace is based in NYC"
-    sequence_b = "Where is HuggingFace based?"
-
-    encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
-    assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
-
-This is enough for some models to understand where one sequence ends and where another begins. However, other models
-such as BERT have an additional mechanism, which are the segment IDs. The Token Type IDs are a binary mask identifying
-the different sequences in the model.
-
-We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
-
-::
-
-    # Continuation of the previous script
-    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
-
-    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-
-The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
-question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
-additional token represented by a :obj:`2`.
-
-
-Position IDs
---------------------------
-
-The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
-position of each token embedded within them, transformers are unaware of the position of each token. The position
-IDs are created for this purpose.
-
-They are an optional parameter. If no position IDs are passed to the model, they are automatically created as absolute
-positional embeddings.
-
-Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
-use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
diff --git a/server/transformers/docs/source/imgs/transformers_logo_name.png b/server/transformers/docs/source/imgs/transformers_logo_name.png
deleted file mode 100644
index 5e4c2dcf575b7f7cf7e64640dee771fc311b7068..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/transformers_logo_name.png and /dev/null differ
diff --git a/server/transformers/docs/source/imgs/warmup_constant_schedule.png b/server/transformers/docs/source/imgs/warmup_constant_schedule.png
deleted file mode 100644
index e2448e9f2c7999497d3e2d252a5dcb22b0ac7da5..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/warmup_constant_schedule.png and /dev/null differ
diff --git a/server/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png b/server/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
deleted file mode 100644
index be73605b9c080cdc7cea8b4ff7e29de90db2d9eb..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png and /dev/null differ
diff --git a/server/transformers/docs/source/imgs/warmup_cosine_schedule.png b/server/transformers/docs/source/imgs/warmup_cosine_schedule.png
deleted file mode 100644
index 6d27926ab10e9d2649ce3f28eb9656ea7cd3e9f8..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/warmup_cosine_schedule.png and /dev/null differ
diff --git a/server/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png b/server/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
deleted file mode 100644
index 71b39bffd3daccf7fc89cad77ef8e03df40bf0ab..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png and /dev/null differ
diff --git a/server/transformers/docs/source/imgs/warmup_linear_schedule.png b/server/transformers/docs/source/imgs/warmup_linear_schedule.png
deleted file mode 100644
index 4e1af31025fafbd9c6b7c74ad6c2948ca2d3ff77..0000000000000000000000000000000000000000
Binary files a/server/transformers/docs/source/imgs/warmup_linear_schedule.png and /dev/null differ
diff --git a/server/transformers/docs/source/index.rst b/server/transformers/docs/source/index.rst
deleted file mode 100644
index f9ff1a0606ce2cf1da2e9f43a8591bfd888fd7f2..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/index.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-Transformers
-================================================================================================================================================
-
-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
-(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
-(NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-
-This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
-
-Features
----------------------------------------------------
-
-- As easy to use as pytorch-transformers
-- As powerful and concise as Keras
-- High performance on NLU and NLG tasks
-- Low barrier to entry for educators and practitioners
-
-State-of-the-art NLP for everyone:
-
-- Deep learning researchers
-- Hands-on practitioners
-- AI/ML/NLP teachers and educators
-
-Lower compute costs, smaller carbon footprint:
-
-- Researchers can share trained models instead of always retraining
-- Practitioners can reduce compute time and production costs
-- 8 architectures with over 30 pretrained models, some in more than 100 languages
-
-Choose the right framework for every part of a model's lifetime:
-
-- Train state-of-the-art models in 3 lines of code
-- Deep interoperability between TensorFlow 2.0 and PyTorch models
-- Move a single model between TF2.0/PyTorch frameworks at will
-- Seamlessly pick the right framework for training, evaluation, production
-
-Contents
----------------------------------
-
-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
-
-1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
-7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
-9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-
-.. toctree::
-    :maxdepth: 2
-    :caption: Notes
-
-    installation
-    quickstart
-    glossary
-    pretrained_models
-    model_sharing
-    examples
-    notebooks
-    serialization
-    converting_tensorflow_models
-    migration
-    bertology
-    torchscript
-    multilingual
-    benchmarks
-
-.. toctree::
-    :maxdepth: 2
-    :caption: Main classes
-
-    main_classes/configuration
-    main_classes/model
-    main_classes/tokenizer
-    main_classes/optimizer_schedules
-    main_classes/processors
-
-.. toctree::
-    :maxdepth: 2
-    :caption: Package Reference
-
-    model_doc/auto
-    model_doc/bert
-    model_doc/gpt
-    model_doc/transformerxl
-    model_doc/gpt2
-    model_doc/xlm
-    model_doc/xlnet
-    model_doc/roberta
-    model_doc/distilbert
-    model_doc/ctrl
-    model_doc/camembert
-    model_doc/albert
-    model_doc/xlmroberta
-    model_doc/flaubert
\ No newline at end of file
diff --git a/server/transformers/docs/source/installation.md b/server/transformers/docs/source/installation.md
deleted file mode 100644
index f4b7781ea9a934a41172605657853fa6bc709cdc..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/installation.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# Installation
-
-Transformers is tested on Python 3.5+ and PyTorch 1.1.0
-
-## With pip
-
-Transformers can be installed using pip as follows:
-
-``` bash
-pip install transformers
-```
-
-## From source
-
-To install from source, clone the repository and install with:
-
-``` bash
-git clone https://github.com/huggingface/transformers.git
-cd transformers
-pip install .
-```
-
-## Tests
-
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.
-
-## OpenAI GPT original tokenization workflow
-
-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
-
-``` bash
-pip install spacy ftfy==4.4.3
-python -m spacy download en
-```
-
-If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
-
-## Note on model downloads (Continuous Integration or large-scale deployments)
-
-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
-
-## Do you want to run a Transformer model on a mobile device?
-
-You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
-
-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
diff --git a/server/transformers/docs/source/main_classes/configuration.rst b/server/transformers/docs/source/main_classes/configuration.rst
deleted file mode 100644
index 2131433759c9c16801e31688ac5be37ea4c22d47..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/main_classes/configuration.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Configuration
-----------------------------------------------------
-
-The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
-
-``PretrainedConfig``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PretrainedConfig
-    :members:
diff --git a/server/transformers/docs/source/main_classes/model.rst b/server/transformers/docs/source/main_classes/model.rst
deleted file mode 100644
index 6e3da45bc2dfa3089e2345b814776ad6790576d1..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/main_classes/model.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Models
-----------------------------------------------------
-
-The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
-
-``PreTrainedModel`` also implements a few methods which are common among all the models to:
-
-- resize the input token embeddings when new tokens are added to the vocabulary
-- prune the attention heads of the model.
-
-``PreTrainedModel``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PreTrainedModel
-    :members:
-
-``TFPreTrainedModel``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFPreTrainedModel
-    :members:
diff --git a/server/transformers/docs/source/main_classes/optimizer_schedules.rst b/server/transformers/docs/source/main_classes/optimizer_schedules.rst
deleted file mode 100644
index ec4998389b2f37ae89240d56f3a7b325f9e78bd7..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/main_classes/optimizer_schedules.rst
+++ /dev/null
@@ -1,72 +0,0 @@
-Optimizer
-----------------------------------------------------
-
-The ``.optimization`` module provides:
-
-- an optimizer with weight decay fixed that can be used to fine-tune models,
-- several schedules in the form of schedule objects that inherit from ``_LRSchedule``, and
-- a gradient accumulation class to accumulate the gradients of multiple batches
-
-``AdamW``
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AdamW
-    :members:
-
-``AdamWeightDecay``
-~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AdamWeightDecay
-    :members:
-
-.. autofunction:: transformers.create_optimizer
-
-Schedules
-----------------------------------------------------
-
-Learning Rate Schedules
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autofunction:: transformers.get_constant_schedule
-
-
-.. autofunction:: transformers.get_constant_schedule_with_warmup
-
-.. image:: /imgs/warmup_constant_schedule.png
-    :target: /imgs/warmup_constant_schedule.png
-    :alt:
-
-
-.. autofunction:: transformers.get_cosine_schedule_with_warmup
-
-.. image:: /imgs/warmup_cosine_schedule.png
-    :target: /imgs/warmup_cosine_schedule.png
-    :alt:
-
-
-.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
-
-.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
-    :target: /imgs/warmup_cosine_hard_restarts_schedule.png
-    :alt:
-
-
-
-.. autofunction:: transformers.get_linear_schedule_with_warmup
-
-.. image:: /imgs/warmup_linear_schedule.png
-    :target: /imgs/warmup_linear_schedule.png
-    :alt:
-
-``Warmup``
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.WarmUp
-    :members:
-
-Gradient Strategies
-----------------------------------------------------
-
-``GradientAccumulator``
-~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GradientAccumulator
diff --git a/server/transformers/docs/source/main_classes/processors.rst b/server/transformers/docs/source/main_classes/processors.rst
deleted file mode 100644
index 46839ce67e6f842e95b83c0086ff82a77a01b60a..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/main_classes/processors.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-Processors
-----------------------------------------------------
-
-This library includes processors for several traditional tasks. These processors can be used to process a dataset into
-examples that can be fed to a model.
-
-Processors
-~~~~~~~~~~~~~~~~~~~~~
-
-All processors follow the same architecture which is that of the
-:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
-of :class:`~transformers.data.processors.utils.InputExample`. These
-:class:`~transformers.data.processors.utils.InputExample` can be converted to
-:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
-
-.. autoclass:: transformers.data.processors.utils.DataProcessor
-    :members:
-
-
-.. autoclass:: transformers.data.processors.utils.InputExample
-    :members:
-
-
-.. autoclass:: transformers.data.processors.utils.InputFeatures
-    :members:
-
-
-GLUE
-~~~~~~~~~~~~~~~~~~~~~
-
-`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
-the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
-`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
-
-This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
-CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
-
-Those processors are:
-    - :class:`~transformers.data.processors.utils.MrpcProcessor`
-    - :class:`~transformers.data.processors.utils.MnliProcessor`
-    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
-    - :class:`~transformers.data.processors.utils.ColaProcessor`
-    - :class:`~transformers.data.processors.utils.Sst2Processor`
-    - :class:`~transformers.data.processors.utils.StsbProcessor`
-    - :class:`~transformers.data.processors.utils.QqpProcessor`
-    - :class:`~transformers.data.processors.utils.QnliProcessor`
-    - :class:`~transformers.data.processors.utils.RteProcessor`
-    - :class:`~transformers.data.processors.utils.WnliProcessor`
-
-Additionally, the following method can be used to load values from a data file and convert them to a list of
-:class:`~transformers.data.processors.utils.InputExample`.
-
-.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
-
-Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
-
-
-XNLI
-~~~~~~~~~~~~~~~~~~~~~
-
-`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
-the quality of cross-lingual text representations. 
-XNLI is a crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment
-annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
-
-It was released together with the paper
-`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
-
-This library hosts the processor to load the XNLI data:
-    - :class:`~transformers.data.processors.utils.XnliProcessor`
-
-Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
-
-An example using these processors is given in the
-`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
-
-
-SQuAD
-~~~~~~~~~~~~~~~~~~~~~
-
-`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that evaluates
-the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
-`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside 
-the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
-
-This library hosts a processor for each of the two versions:
-
-Processors
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Those processors are:
-    - :class:`~transformers.data.processors.utils.SquadV1Processor`
-    - :class:`~transformers.data.processors.utils.SquadV2Processor`
-
-They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
-
-.. autoclass:: transformers.data.processors.squad.SquadProcessor
-    :members:
-
-Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
-that can be used as model inputs.
-
-.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
-
-These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
-Examples are given below.
-
-
-Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^
-Here is an example using the processors as well as the conversion method using data files:
-
-Example::
-
-    # Loading a V2 processor
-    processor = SquadV2Processor()
-    examples = processor.get_dev_examples(squad_v2_data_dir)
-
-    # Loading a V1 processor
-    processor = SquadV1Processor()
-    examples = processor.get_dev_examples(squad_v1_data_dir)
-
-    features = squad_convert_examples_to_features( 
-        examples=examples,
-        tokenizer=tokenizer,
-        max_seq_length=max_seq_length,
-        doc_stride=args.doc_stride,
-        max_query_length=max_query_length,
-        is_training=not evaluate,
-    )
-
-Using `tensorflow_datasets` is as easy as using a data file:
-
-Example::
-
-    # tensorflow_datasets only handles Squad V1.
-    tfds_examples = tfds.load("squad")
-    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
-
-    features = squad_convert_examples_to_features( 
-        examples=examples,
-        tokenizer=tokenizer,
-        max_seq_length=max_seq_length,
-        doc_stride=args.doc_stride,
-        max_query_length=max_query_length,
-        is_training=not evaluate,
-    )
-
-
-Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
diff --git a/server/transformers/docs/source/main_classes/tokenizer.rst b/server/transformers/docs/source/main_classes/tokenizer.rst
deleted file mode 100644
index c33eb458292716d08ff2a10cccb492107c77a9b0..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/main_classes/tokenizer.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-Tokenizer
-----------------------------------------------------
-
-The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
-
-``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
-
-- tokenizing, converting tokens to ids and back and encoding/decoding,
-- adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...),
-- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
-
-``PreTrainedTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.PreTrainedTokenizer
-    :members:
diff --git a/server/transformers/docs/source/migration.md b/server/transformers/docs/source/migration.md
deleted file mode 100644
index f50d1dff0a8e2a6205c66a6a012d17fb98b19f38..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/migration.md
+++ /dev/null
@@ -1,109 +0,0 @@
-# Migrating from pytorch-pretrained-bert
-
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
-
-### Models always output `tuples`
-
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-
-The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
-
-In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-
-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
-
-```python
-# Let's load our model
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-# If you used to have this line in pytorch-pretrained-bert:
-loss = model(input_ids, labels=labels)
-
-# Now just use this line in transformers to extract the loss from the output tuple:
-outputs = model(input_ids, labels=labels)
-loss = outputs[0]
-
-# In transformers you can also have access to the logits:
-loss, logits = outputs[:2]
-
-# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
-outputs = model(input_ids, labels=labels)
-loss, logits, attentions = outputs
-```
-
-### Serialization
-
-Breaking change in the `from_pretrained()` method:
-
-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
-
-2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first which can break derived model classes built based on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded to the model `__init__()` method while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method.
-
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
-
-Here is an example:
-
-```python
-### Let's load a model and tokenizer
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-### Do some stuff to our model and tokenizer
-# Ex: add new tokens to the vocabulary and embeddings of our model
-tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
-model.resize_token_embeddings(len(tokenizer))
-# Train our model
-train(model)
-
-### Now let's save our model and tokenizer to a directory
-model.save_pretrained('./my_saved_model_directory/')
-tokenizer.save_pretrained('./my_saved_model_directory/')
-
-### Reload the model and the tokenizer
-model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
-tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
-```
-
-### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
-
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
-
-- it only implements weight decay correction,
-- schedules are now external (see below),
-- gradient clipping is now also external (see below).
-
-The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and lets you use standard PyTorch or apex methods for the schedule and clipping.
-
-The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
-
-Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
-
-```python
-# Parameters:
-lr = 1e-3
-max_grad_norm = 1.0
-num_training_steps = 1000
-num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
-
-### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
-### and used like this:
-for batch in train_data:
-    loss = model(batch)
-    loss.backward()
-    optimizer.step()
-
-### In Transformers, optimizer and schedules are split and instantiated like this:
-optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
-### and used like this:
-for batch in train_data:
-    loss = model(batch)
-    loss.backward()
-    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
-    optimizer.step()
-    scheduler.step()
-```
diff --git a/server/transformers/docs/source/model_doc/albert.rst b/server/transformers/docs/source/model_doc/albert.rst
deleted file mode 100644
index 06a9b5bfd50b0c7aef601b6693572bef8bc20b82..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/albert.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-ALBERT
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
-by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
-two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:
-
-- Splitting the embedding matrix into two smaller matrices
-- Using repeating layers split among groups
-
-The abstract from the paper is the following:
-
-*Increasing model size when pretraining natural language representations often results in improved performance on
-downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations,
-longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
-techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
-that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
-self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
-tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
-RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
-
-Tips:
-
-- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
-- ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
-  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
-  number of (repeating) layers.
-
-AlbertConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertConfig
-    :members:
-
-
-AlbertTokenizer
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertTokenizer
-    :members:
-
-
-AlbertModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertModel
-    :members:
-
-
-AlbertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertForMaskedLM
-    :members:
-
-
-AlbertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertForSequenceClassification
-    :members:
-
-
-AlbertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AlbertForQuestionAnswering
-    :members:
-
-
-TFAlbertModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFAlbertModel
-    :members:
-
-
-TFAlbertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFAlbertForMaskedLM
-    :members:
-
-
-TFAlbertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFAlbertForSequenceClassification
-    :members:
diff --git a/server/transformers/docs/source/model_doc/auto.rst b/server/transformers/docs/source/model_doc/auto.rst
deleted file mode 100644
index 541d03a8e588ecec7fa483c5f48243f49a76d6cc..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/auto.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-AutoModels
------------
-
-In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
-
-AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
-
-Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of ``BertModel``).
-
-
-``AutoConfig``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoConfig
-    :members:
-
-
-``AutoTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoTokenizer
-    :members:
-
-
-``AutoModel``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModel
-    :members:
-
-
-``AutoModelForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModelForPreTraining
-    :members:
-
-
-``AutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModelWithLMHead
-    :members:
-
-
-``AutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModelForSequenceClassification
-    :members:
-
-
-``AutoModelForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModelForQuestionAnswering
-    :members:
-
-
-``AutoModelForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoModelForTokenClassification
-    :members:
-
diff --git a/server/transformers/docs/source/model_doc/bert.rst b/server/transformers/docs/source/model_doc/bert.rst
deleted file mode 100644
index 5e785eed1c9f3fff0063b4d881b86535423093a4..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/bert.rst
+++ /dev/null
@@ -1,162 +0,0 @@
-BERT
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__
-by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
-pre-trained using a combination of masked language modeling objective and next sentence prediction
-on a large corpus comprising the Toronto Book Corpus and Wikipedia.
-
-The abstract from the paper is the following:
-
-*We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations
-from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional
-representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result,
-the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models
-for a wide range of tasks, such as question answering and language inference, without substantial task-specific
-architecture modifications.*
-
-*BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural
-language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI
-accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute
-improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
-
-Tips:
-
-- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
-- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
-  tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
-  modeling (CLM) objective are better in that regard.
-- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
-  approximate. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
-  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
-  the [CLS] token.
-
-BertConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertConfig
-    :members:
-
-
-BertTokenizer
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertTokenizer
-    :members:
-
-
-BertModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertModel
-    :members:
-
-
-BertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForPreTraining
-    :members:
-
-
-BertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForMaskedLM
-    :members:
-
-
-BertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForNextSentencePrediction
-    :members:
-
-
-BertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForSequenceClassification
-    :members:
-
-
-BertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForMultipleChoice
-    :members:
-
-
-BertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForTokenClassification
-    :members:
-
-
-BertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertForQuestionAnswering
-    :members:
-
-
-TFBertModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertModel
-    :members:
-
-
-TFBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForPreTraining
-    :members:
-
-
-TFBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForMaskedLM
-    :members:
-
-
-TFBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForNextSentencePrediction
-    :members:
-
-
-TFBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForSequenceClassification
-    :members:
-
-
-TFBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForMultipleChoice
-    :members:
-
-
-TFBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForTokenClassification
-    :members:
-
-
-TFBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFBertForQuestionAnswering
-    :members:
-
diff --git a/server/transformers/docs/source/model_doc/camembert.rst b/server/transformers/docs/source/model_doc/camembert.rst
deleted file mode 100644
index 611d930d6ed8fd16c0b4b6d1d0350683f7778cd1..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/camembert.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-CamemBERT
-----------------------------------------------------
-
-The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
-by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
-Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
-trained on 138GB of French text.
-
-The abstract from the paper is the following:
-
-*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
-most available models have either been trained on English data or on the concatenation of data in multiple
-languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
-to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
-Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
-downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
-language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
-pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
-
-Tips:
-
-- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
-  examples as well as the information relative to the inputs and outputs.
-
-CamembertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertConfig
-    :members:
-
-
-CamembertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertTokenizer
-    :members:
-
-
-CamembertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertModel
-    :members:
-
-
-CamembertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertForMaskedLM
-    :members:
-
-
-CamembertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertForSequenceClassification
-    :members:
-
-
-CamembertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertForMultipleChoice
-    :members:
-
-
-CamembertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CamembertForTokenClassification
-    :members:
-
-
-TFCamembertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCamembertModel
-    :members:
-
-
-TFCamembertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCamembertForMaskedLM
-    :members:
-
-
-TFCamembertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCamembertForSequenceClassification
-    :members:
-
-
-TFCamembertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCamembertForTokenClassification
-    :members:
diff --git a/server/transformers/docs/source/model_doc/ctrl.rst b/server/transformers/docs/source/model_doc/ctrl.rst
deleted file mode 100644
index a8a04837d75ea068286cb37ab1f3b02ffb4a1ad4..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/ctrl.rst
+++ /dev/null
@@ -1,75 +0,0 @@
-CTRL
-----------------------------------------------------
-
-CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
-by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
-
-The abstract from the paper is the following:
-
-*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
-aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
-trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
-derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
-while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
-the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
-of data via model-based source attribution.*
-
-Tips:
-
-- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
-  or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__
-  for more information.
-- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
-- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
-- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
-  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
-  of this argument.
-
-
-CTRLConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CTRLConfig
-    :members:
-
-
-CTRLTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CTRLTokenizer
-    :members:
-
-
-CTRLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CTRLModel
-    :members:
-
-
-CTRLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CTRLLMHeadModel
-    :members:
-
-
-TFCTRLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCTRLModel
-    :members:
-
-
-TFCTRLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFCTRLLMHeadModel
-    :members:
-
diff --git a/server/transformers/docs/source/model_doc/distilbert.rst b/server/transformers/docs/source/model_doc/distilbert.rst
deleted file mode 100644
index 81d8086c151fd8b864c7dad409da6c970ec39790..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/distilbert.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-DistilBERT
-----------------------------------------------------
-
-The DistilBERT model was proposed in the blog post
-`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
-and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
-DistilBERT is a small, fast, cheap and light Transformer model trained by distilling Bert base. It has 40% fewer
-parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performance as measured on
-the GLUE language understanding benchmark.
-
-The abstract from the paper is the following:
-
-*As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP),
-operating these large models in on-the-edge and/or under constrained computational training or inference budgets
-remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
-model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
-counterparts. While most prior work investigated the use of distillation for building task-specific models, we
-leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
-BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
-the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
-modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
-and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
-on-device study.*
-
-Tips:
-
-- DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
-- DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
-
-
-DistilBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertConfig
-    :members:
-
-
-DistilBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertTokenizer
-    :members:
-
-
-DistilBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertModel
-    :members:
-
-
-DistilBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertForMaskedLM
-    :members:
-
-
-DistilBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertForSequenceClassification
-    :members:
-
-
-DistilBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DistilBertForQuestionAnswering
-    :members:
-
-TFDistilBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFDistilBertModel
-    :members:
-
-
-TFDistilBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFDistilBertForMaskedLM
-    :members:
-
-
-TFDistilBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFDistilBertForSequenceClassification
-    :members:
-
-
-TFDistilBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFDistilBertForQuestionAnswering
-    :members:
diff --git a/server/transformers/docs/source/model_doc/flaubert.rst b/server/transformers/docs/source/model_doc/flaubert.rst
deleted file mode 100644
index d0211306eed90c781f418327a9ebe5feb359624b..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/flaubert.rst
+++ /dev/null
@@ -1,72 +0,0 @@
-FlauBERT
-----------------------------------------------------
-
-The FlauBERT model was proposed in the paper
-`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
-It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
-
-The abstract from the paper is the following:
-
-*Language models have become a key step to achieve state-of-the art results in many different Natural Language
-Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient
-way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
-contextualization at the sentence level. This has been widely demonstrated for English using contextualized
-representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et
-al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large
-and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre
-for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
-classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most
-of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified
-evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
-to the research community for further reproducible experiments in French NLP.*
-
-
-FlaubertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertConfig
-    :members:
-
-
-FlaubertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertTokenizer
-    :members:
-
-
-FlaubertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertModel
-    :members:
-
-
-FlaubertWithLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertWithLMHeadModel
-    :members:
-
-
-FlaubertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertForSequenceClassification
-    :members:
-
-
-FlaubertForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple
-    :members:
-
-
-FlaubertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaubertForQuestionAnswering
-    :members:
-
-
diff --git a/server/transformers/docs/source/model_doc/gpt.rst b/server/transformers/docs/source/model_doc/gpt.rst
deleted file mode 100644
index 9604b39ceae0a435deab58df5ed6648f588c27f5..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/gpt.rst
+++ /dev/null
@@ -1,92 +0,0 @@
-OpenAI GPT
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training <https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
-by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional)
-transformer pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.
-
-The abstract from the paper is the following:
-
-*Natural language understanding comprises a wide range of diverse tasks such
-as textual entailment, question answering, semantic similarity assessment, and
-document classification. Although large unlabeled text corpora are abundant,
-labeled data for learning these specific tasks is scarce, making it challenging for
-discriminatively trained models to perform adequately. We demonstrate that large
-gains on these tasks can be realized by generative pre-training of a language model
-on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
-specific task. In contrast to previous approaches, we make use of task-aware input
-transformations during fine-tuning to achieve effective transfer while requiring
-minimal changes to the model architecture. We demonstrate the effectiveness of
-our approach on a wide range of benchmarks for natural language understanding.
-Our general task-agnostic model outperforms discriminatively trained models that
-use architectures specifically crafted for each task, significantly improving upon the
-state of the art in 9 out of the 12 tasks studied.*
-
-Tips:
-
-- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
-- GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows GPT to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
-
-`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
-Hugging Face showcasing the generative capabilities of several models. GPT is one of them.
-
-OpenAIGPTConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.OpenAIGPTConfig
-    :members:
-
-
-OpenAIGPTTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.OpenAIGPTTokenizer
-    :members:
-
-
-OpenAIGPTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.OpenAIGPTModel
-    :members:
-
-
-OpenAIGPTLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.OpenAIGPTLMHeadModel
-    :members:
-
-
-OpenAIGPTDoubleHeadsModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
-    :members:
-
-
-TFOpenAIGPTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFOpenAIGPTModel
-    :members:
-
-
-TFOpenAIGPTLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
-    :members:
-
-
-TFOpenAIGPTDoubleHeadsModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
-    :members:
diff --git a/server/transformers/docs/source/model_doc/gpt2.rst b/server/transformers/docs/source/model_doc/gpt2.rst
deleted file mode 100644
index 54ef3cea08c3d864d23a0bf567789ad899de7081..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/gpt2.rst
+++ /dev/null
@@ -1,91 +0,0 @@
-OpenAI GPT2
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-OpenAI GPT-2 model was proposed in
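-`Language Models are Unsupervised Multitask Learners <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_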
-by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-corpus of ~40 GB of text data.
-
-The abstract from the paper is the following:
-
-*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1]
-of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous
-words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring
-demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X
-the parameters and trained on more than 10X the amount of data.*
-
-Tips:
-
-- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
-- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
-- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
-  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
-  of this argument.
-
-`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
-Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
-different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.
-
-
-GPT2Config
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPT2Config
-    :members:
-
-
-GPT2Tokenizer
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPT2Tokenizer
-    :members:
-
-
-GPT2Model
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPT2Model
-    :members:
-
-
-GPT2LMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPT2LMHeadModel
-    :members:
-
-
-GPT2DoubleHeadsModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPT2DoubleHeadsModel
-    :members:
-
-
-TFGPT2Model
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFGPT2Model
-    :members:
-
-
-TFGPT2LMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFGPT2LMHeadModel
-    :members:
-
-
-TFGPT2DoubleHeadsModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFGPT2DoubleHeadsModel
-    :members:
diff --git a/server/transformers/docs/source/model_doc/roberta.rst b/server/transformers/docs/source/model_doc/roberta.rst
deleted file mode 100644
index d3276d55e0bbfa233629d45fe4de8da4eed09331..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/roberta.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-RoBERTa
-----------------------------------------------------
-
-The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
-by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
-Veselin Stoyanov. It is based on Google's BERT model released in 2018.
-
-It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
-objective and training with much larger mini-batches and learning rates.
-
-The abstract from the paper is the following:
-
-*Language model pretraining has led to significant performance gains but careful comparison between different
-approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
-and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
-study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
-training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of
-every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These
-results highlight the importance of previously overlooked design choices, and raise questions about the source
-of recently reported improvements. We release our models and code.*
-
-Tips:
-
-- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
-  setup for Roberta pretrained models.
-- `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.
-
-RobertaConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaConfig
-    :members:
-
-
-RobertaTokenizer
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaTokenizer
-    :members:
-
-
-RobertaModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaModel
-    :members:
-
-
-RobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForMaskedLM
-    :members:
-
-
-RobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForSequenceClassification
-    :members:
-
-
-RobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.RobertaForTokenClassification
-    :members:
-
-TFRobertaModel
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaModel
-    :members:
-
-
-TFRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForMaskedLM
-    :members:
-
-
-TFRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForSequenceClassification
-    :members:
-
-
-TFRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRobertaForTokenClassification
-    :members:
diff --git a/server/transformers/docs/source/model_doc/transformerxl.rst b/server/transformers/docs/source/model_doc/transformerxl.rst
deleted file mode 100644
index 5240df3df4aec29fefd7032e39c0bca78a4f379e..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/transformerxl.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-Transformer XL
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-The Transformer-XL model was proposed in
-`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__
-by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
-previously computed hidden-states to attend to longer context (memory).
-This model also uses adaptive softmax inputs and outputs (tied).
-
-The abstract from the paper is the following:
-
-*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
-setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
-beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and
-a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves
-the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and
-450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up
-to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results
-of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on
-Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
-coherent, novel text articles with thousands of tokens.*
-
-Tips:
-
-- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right.
-  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
-- Transformer-XL is one of the few models that has no sequence length limit.
-
-
-TransfoXLConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLConfig
-    :members:
-
-
-TransfoXLTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLTokenizer
-    :members:
-
-
-TransfoXLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLModel
-    :members:
-
-
-TransfoXLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLLMHeadModel
-    :members:
-
-
-TFTransfoXLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTransfoXLModel
-    :members:
-
-
-TFTransfoXLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTransfoXLLMHeadModel
-    :members:
diff --git a/server/transformers/docs/source/model_doc/xlm.rst b/server/transformers/docs/source/model_doc/xlm.rst
deleted file mode 100644
index 73466937523efabbc60c821319222c81b817e6df..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/xlm.rst
+++ /dev/null
@@ -1,106 +0,0 @@
-XLM
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-The XLM model was proposed in `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_
-by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
-
-- a causal language modeling (CLM) objective (next token prediction),
-- a masked language modeling (MLM) objective (Bert-like), or
-- a Translation Language Modeling (TLM) objective (extension of Bert's MLM to multiple language inputs)
-
-The abstract from the paper is the following:
-
-*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
-In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining.
-We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
-data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
-state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI,
-our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation,
-we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On
-supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming
-the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
-
-Tips:
-
-- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
-  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
-- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
-  `multi-lingual <../multilingual.html>`__ page for more information.
-
-
-XLMConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMConfig
-    :members:
-
-XLMTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMTokenizer
-    :members:
-
-XLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMModel
-    :members:
-
-
-XLMWithLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMWithLMHeadModel
-    :members:
-
-
-XLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForSequenceClassification
-    :members:
-
-
-XLMForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForQuestionAnsweringSimple
-    :members:
-
-
-XLMForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMForQuestionAnswering
-    :members:
-
-
-TFXLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMModel
-    :members:
-
-
-TFXLMWithLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMWithLMHeadModel
-    :members:
-
-
-TFXLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForSequenceClassification
-    :members:
-
-
-TFXLMForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
-    :members:
diff --git a/server/transformers/docs/source/model_doc/xlmroberta.rst b/server/transformers/docs/source/model_doc/xlmroberta.rst
deleted file mode 100644
index 8ddb38b1c2159334e4878cdcd0da9c925d7e7aa5..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/xlmroberta.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-XLM-RoBERTa
-------------------------------------------
-
-The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
-by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
-Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
-It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
-
-The abstract from the paper is the following:
-
-*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for
-a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
-languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
-outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy
-on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
-low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model.
-We also present a detailed empirical evaluation of the key factors that are required to achieve these gains,
-including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
-low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling
-without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE
-and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
-
-Tips:
-
-- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
-  examples as well as the information relative to the inputs and outputs.
-
-XLMRobertaConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaConfig
-    :members:
-
-
-XLMRobertaTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaTokenizer
-    :members:
-
-
-XLMRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaModel
-    :members:
-
-
-XLMRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForMaskedLM
-    :members:
-
-
-XLMRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForSequenceClassification
-    :members:
-
-
-XLMRobertaForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForMultipleChoice
-    :members:
-
-
-XLMRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLMRobertaForTokenClassification
-    :members:
-
-
-TFXLMRobertaModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaModel
-    :members:
-
-
-TFXLMRobertaForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForMaskedLM
-    :members:
-
-
-TFXLMRobertaForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForSequenceClassification
-    :members:
-
-
-TFXLMRobertaForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLMRobertaForTokenClassification
-    :members:
diff --git a/server/transformers/docs/source/model_doc/xlnet.rst b/server/transformers/docs/source/model_doc/xlnet.rst
deleted file mode 100644
index 0f8c61098c60bd2195f554dc1d8de8fe164428a7..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_doc/xlnet.rst
+++ /dev/null
@@ -1,124 +0,0 @@
-XLNet
-----------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~
-
-The XLNet model was proposed in `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_
-by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
-to learn bidirectional contexts by maximizing the expected likelihood over all permutations
-of the input sequence factorization order.
-
-The abstract from the paper is the following:
-
-*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
-better performance than pretraining approaches based on autoregressive language modeling. However, relying on
-corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
-pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
-pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over
-all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
-formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model,
-into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by
-a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
-
-Tips:
-
-- The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
-- Due to the difficulty of training a fully auto-regressive model over various factorization orders,
-  XLNet is pretrained using only a sub-set of the output tokens as target which are selected
-  with the `target_mapping` input.
-- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
-  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
-- XLNet is one of the few models that has no sequence length limit.
-
-
-XLNetConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetConfig
-    :members:
-
-
-XLNetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetTokenizer
-    :members:
-
-
-XLNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetModel
-    :members:
-
-
-XLNetLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetLMHeadModel
-    :members:
-
-
-XLNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForSequenceClassification
-    :members:
-
-
-XLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForTokenClassification
-    :members:
-
-
-XLNetForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForMultipleChoice
-    :members:
-
-
-XLNetForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
-    :members:
-
-
-XLNetForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForQuestionAnswering
-    :members:
-
-
-TFXLNetModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetModel
-    :members:
-
-
-TFXLNetLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetLMHeadModel
-    :members:
-
-
-TFXLNetForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForSequenceClassification
-    :members:
-
-
-TFXLNetForQuestionAnsweringSimple
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
-    :members:
diff --git a/server/transformers/docs/source/model_sharing.md b/server/transformers/docs/source/model_sharing.md
deleted file mode 100644
index 03ea4c3d8060cf64c03771071e4d8cda58bfc1b4..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/model_sharing.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Model upload and sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
-```python
-"username/pretrained_model"
-```
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
-model = AutoModel.from_pretrained("username/pretrained_model")
-```
-
-Finally, list all your files on S3:
-```shell
-transformers-cli s3 ls
-# List all your S3 objects.
-```
-
-You can also delete files:
-
-```shell
-transformers-cli s3 rm …
-```
\ No newline at end of file
diff --git a/server/transformers/docs/source/multilingual.rst b/server/transformers/docs/source/multilingual.rst
deleted file mode 100644
index f6f72b2434e8480874c4e13f88b6ab156b326ea7..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/multilingual.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-Multi-lingual models
-================================================
-
-Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
-multi-lingual models are available and have different mechanisms than mono-lingual models.
-This page details the usage of these models.
-
-The two models that currently support multiple languages are BERT and XLM.
-
-XLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
-be split in two categories: the checkpoints that make use of language embeddings, and those that don't.
-
-XLM & Language Embeddings
-------------------------------------------------
-
-This section concerns the following checkpoints:
-
-- ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
-- ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
-- ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
-- ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
-- ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
-- ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
-- ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
-
-These checkpoints require language embeddings that will specify the language used at inference time. These language
-embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
-these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
-from the tokenizer.
-
-Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
-
-
-.. code-block::
-
-    import torch
-    from transformers import XLMTokenizer, XLMWithLMHeadModel
-
-    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
-    model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
-
-The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
-``lang2id`` attribute:
-
-.. code-block::
-
-    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
-
-
-These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
-
-.. code-block::
-
-    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
-
-
-We should now define the language embedding by using the previously defined language id. We want to create a tensor
-filled with the appropriate language ids, of the same size as input_ids. For English, the id is 0:
-
-.. code-block::
-
-    language_id = tokenizer.lang2id['en']  # 0
-    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
-
-    # We reshape it to be of size (batch_size, sequence_length)
-    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
-
-
-You can then feed it all as input to your model:
-
-.. code-block::
-
-    outputs = model(input_ids, langs=langs)
-
-
-The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
-can generate text using the CLM checkpoints from XLM, using the language embeddings.
-
-XLM without Language Embeddings
-------------------------------------------------
-
-This section concerns the following checkpoints:
-
-- ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
-- ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
-
-These checkpoints do not require language embeddings at inference time. These models are used to have generic
-sentence representations, unlike the previously mentioned XLM checkpoints.
-
-
-BERT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-BERT has two checkpoints that can be used for multi-lingual tasks:
-
-- ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
-- ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
-
-These checkpoints do not require language embeddings at inference time. They should identify the language
-used in the context and infer accordingly.
\ No newline at end of file
diff --git a/server/transformers/docs/source/notebooks.rst b/server/transformers/docs/source/notebooks.rst
deleted file mode 100644
index fe669e8e47f8bf76fa26380e46ea74d951e135be..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/notebooks.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-Notebooks
-================================================
-
-We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
-
-
-*
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
-
-*
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` model and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
-
-*
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
-
-Please follow the instructions given in the notebooks to run and modify them.
diff --git a/server/transformers/docs/source/pretrained_models.rst b/server/transformers/docs/source/pretrained_models.rst
deleted file mode 100644
index e124e414c91a62485712ed08427de24e05bbd861..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/pretrained_models.rst
+++ /dev/null
@@ -1,272 +0,0 @@
-Pretrained models
-================================================
-
-Here is the full list of the currently provided pretrained models together with a short presentation of each model.
-
-For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.
-
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Architecture      | Shortcut name                                              | Details of the model                                                                                                                  |
-+===================+============================================================+=======================================================================================================================================+
-| BERT              | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased``                                     | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased``                                        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased English text.                                                                                                      |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased``                                       | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on cased English text.                                                                                                      |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                        |
-|                   |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                             |
-|                   |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Chinese Simplified and Traditional text.                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
-|                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
-|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
-|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
-|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
-|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-dutch-cased``                                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
-|                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | OpenAI GPT English model                                                                                                            |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| GPT-2             | ``gpt2``                                                   | | 12-layer, 768-hidden, 12-heads, 117M parameters.                                                                                    |
-|                   |                                                            | | OpenAI GPT-2 English model                                                                                                          |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
-|                   |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
-|                   |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
-|                   |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Transformer-XL    | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
-|                   |                                                            | | English model trained on wikitext-103                                                                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLNet             | ``xlnet-base-cased``                                       | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | XLNet English model                                                                                                                 |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | XLNet Large English model                                                                                                           |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM English model                                                                                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German model trained on the concatenation of English and German wikipedia                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-French model trained on the concatenation of English and French wikipedia                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
-|                   |                                                            | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
-|                   |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
-|                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
-|                   |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
-|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
-|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint.                   |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
-|                   |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint.             |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
-|                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| CamemBERT         | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
-|                   |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
-|                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
-|                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
-|                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xxlarge-v1``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
-|                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
-|                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
-|                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
-|                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xxlarge-v2``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
-|                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM-RoBERTa       | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
-|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
-|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| FlauBERT          | ``flaubert-small-cased``                                   | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
-|                   |                                                            | | FlauBERT small architecture                                                                                                         |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-uncased``                                  | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
-|                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-cased``                                    | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
-|                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-large-cased``                                   | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
-|                   |                                                            | | FlauBERT large architecture                                                                                                         |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
-
-.. <https://huggingface.co/transformers/examples.html>`__
diff --git a/server/transformers/docs/source/quickstart.md b/server/transformers/docs/source/quickstart.md
deleted file mode 100644
index 60e2cf3fd84193365abb92432b403386b188f1ac..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/quickstart.md
+++ /dev/null
@@ -1,315 +0,0 @@
-# Quickstart
-
-## Philosophy
-
-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
-
-The library was designed with two strong goals in mind:
-
-- be as easy and fast to use as possible:
-
-  - we strongly limited the number of user-facing abstractions to learn, in fact there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
-  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
-  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
-
-- provide state-of-the-art models with performances as close as possible to the original models:
-
-  - we provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture,
-  - the code is usually as close to the original code base as possible which means some PyTorch code may not be as *pytorchic* as it could be as a result of being converted from TensorFlow code.
-
-A few other goals:
-
-- expose the models' internals as consistently as possible:
-
-  - we give access, using a single API to the full hidden-states and attention weights,
-  - tokenizer and base model's API are standardized to easily switch between models.
-
-- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
-
-  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
-  - simple ways to mask and prune transformer heads.
-
-## Main concepts
-
-The library is built around three types of classes for each model:
-
-- **model classes** which are PyTorch models (`torch.nn.Module`) of the 8 model architectures currently provided in the library, e.g. `BertModel`
-- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
-- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings into lists of token embedding indices to be fed to a model, e.g. `BertTokenizer`
-
-All these classes can be instantiated from pretrained instances and saved locally using two methods:
-
-- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
-- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
-
-We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
-
-- the **MAIN CLASSES** section details the common functionalities/methods/attributes of the three main types of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
-- the **PACKAGE REFERENCE** section details all the variants of each class for each model architecture and in particular the input/output that you should expect when calling each of them.
-
-## Quick tour: Usage
-
-Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
-
-See full API reference for examples for each model class.
-
-### BERT example
-
-Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer`
-
-```python
-import torch
-from transformers import BertTokenizer, BertModel, BertForMaskedLM
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Tokenize input
-text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-tokenized_text = tokenizer.tokenize(text)
-
-# Mask a token that we will try to predict back with `BertForMaskedLM`
-masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
-assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
-
-# Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
-segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-# Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
-segments_tensors = torch.tensor([segments_ids])
-```
-
-Let's see how we can use `BertModel` to encode our inputs in hidden-states:
-
-```python
-# Load pre-trained model (weights)
-model = BertModel.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the hidden state of the last layer of the Bert model
-    encoded_layers = outputs[0]
-# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
-assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
-```
-
-And how to use `BertForMaskedLM` to predict a masked token:
-
-```python
-# Load pre-trained model (weights)
-model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'henson'
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'henson'
-```
-
-### OpenAI GPT-2
-
-Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
-
-First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
-
-```python
-import torch
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-
-# Encode a text inputs
-text = "Who was Jim Henson ? Jim Henson was a"
-indexed_tokens = tokenizer.encode(text)
-
-# Convert indexed tokens in a PyTorch tensor
-tokens_tensor = torch.tensor([indexed_tokens])
-```
-
-Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
-
-```python
-# Load pre-trained model (weights)
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor)
-    predictions = outputs[0]
-
-# get the predicted next sub-word (in our case, the word 'man')
-predicted_index = torch.argmax(predictions[0, -1, :]).item()
-predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
-assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
-```
-
-Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
-
-#### Using the past
-
-GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
-
-Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
-
-```python
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-import torch
-
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-generated = tokenizer.encode("The Manhattan bridge")
-context = torch.tensor([generated])
-past = None
-
-for i in range(100):
-    print(i)
-    output, past = model(context, past=past)
-    token = torch.argmax(output[..., -1, :])
-
-    generated += [token.tolist()]
-    context = token.unsqueeze(0)
-
-sequence = tokenizer.decode(generated)
-
-print(sequence)
-```
-
-The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
-
-### Model2Model example
-
-Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
-
-```python
-import torch
-from transformers import BertTokenizer, Model2Model
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Encode the input to the encoder (the question)
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-
-# Encode the input to the decoder (the answer)
-answer = "Jim Henson was a puppeteer"
-encoded_answer = tokenizer.encode(answer)
-
-# Convert inputs to PyTorch tensors
-question_tensor = torch.tensor([encoded_question])
-answer_tensor = torch.tensor([encoded_answer])
-```
-
-Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
-
-```python
-# In order to compute the loss we need to provide language model
-# labels (the token ids that the model should have produced) to
-# the decoder.
-lm_labels =  encoded_answer
-labels_tensor = torch.tensor([lm_labels])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-labels_tensor = labels_tensor.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the value of the LM loss 
-    lm_loss = outputs[0]
-```
-
-This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
-
-```python
-# Let's re-use the previous question
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-question_tensor = torch.tensor([encoded_question])
-
-# This time we try to generate the answer, so we start with an empty sequence
-answer = "[CLS]"
-encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
-answer_tensor = torch.tensor([encoded_answer])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('fine-tuned-weights')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(question_tensor, answer_tensor)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'jim'
-predicted_index = torch.argmax(predictions[0, -1]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'jim'
-```
diff --git a/server/transformers/docs/source/serialization.rst b/server/transformers/docs/source/serialization.rst
deleted file mode 100644
index d2862dc0b50589a84f3c354c5fc3fdc2638ed010..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/serialization.rst
+++ /dev/null
@@ -1,190 +0,0 @@
-Loading Google AI or OpenAI pre-trained weights or PyTorch dump
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``from_pretrained()`` method
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
-
-.. code-block:: python
-
-   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
-
-where
-
-
-* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
-*
-  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
-
-
-  *
-    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
-
-
-    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
-    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
-    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
-    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
-    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    * ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
-
-  *
-    a path or url to a pretrained model archive containing:
-
-
-    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
-    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
-
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
-
-*
-  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
-
-* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
-* ``*inputs``\ , ``**kwargs``\ : additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
-
-``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
-
-When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer yourself).
-
-Examples:
-
-.. code-block:: python
-
-   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
-   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-   # OpenAI GPT
-   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-   model = OpenAIGPTModel.from_pretrained('openai-gpt')
-
-   # Transformer-XL
-   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-
-   # OpenAI GPT-2
-   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-   model = GPT2Model.from_pretrained('gpt2')
-
-Cache directory
-~~~~~~~~~~~~~~~
-
-``pytorch_pretrained_bert`` saves the pretrained weights in a cache directory which is located at (in this order of priority):
-
-
-* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
-* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
-* PyTorch cache home + ``/pytorch_pretrained_bert/``
-  where PyTorch cache home is defined by (in this order):
-
-  * shell environment variable ``ENV_TORCH_HOME``
-  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``
-  * default: ``~/.cache/torch/``
-
-Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
-
-You can always safely delete the ``pytorch_pretrained_bert`` cache, but the pretrained model weights and vocabulary files will have to be re-downloaded from our S3.
-
-Serialization best-practices
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
-There are three types of files you need to save to be able to reload a fine-tuned model:
-
-
-* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
-* the configuration file of the model which is saved as a JSON file, and
-* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
-
-The *default filenames* of these files are as follows:
-
-
-* the model weights file: ``pytorch_model.bin``\ ,
-* the configuration file: ``config.json``\ ,
-* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
-* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
-
-**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
-
-Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
-
-.. code-block:: python
-
-   from transformers import WEIGHTS_NAME, CONFIG_NAME
-
-   output_dir = "./models/"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   # If we save using the predefined names, we can load using `from_pretrained`
-   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-   output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_dir)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # Example for a Bert model
-   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
-   # Example for a GPT model
-   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-
-Here is another way you can save and reload the model if you want to use specific paths for each type of files:
-
-.. code-block:: python
-
-   output_model_file = "./models/my_own_model_file.bin"
-   output_config_file = "./models/my_own_config_file.bin"
-   output_vocab_file = "./models/my_own_vocab_file.bin"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_vocab_file)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-   # Here is how to do it in this situation:
-
-   # Example for a Bert model
-   config = BertConfig.from_json_file(output_config_file)
-   model = BertForQuestionAnswering(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
-
-   # Example for a GPT model
-   config = OpenAIGPTConfig.from_json_file(output_config_file)
-   model = OpenAIGPTDoubleHeadsModel(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-
diff --git a/server/transformers/docs/source/torchscript.rst b/server/transformers/docs/source/torchscript.rst
deleted file mode 100644
index fd1eeb53635ff30bac4597d2e0308b9443c6afbe..0000000000000000000000000000000000000000
--- a/server/transformers/docs/source/torchscript.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-TorchScript
-================================================
-
-.. note::
-    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
-    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
-    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
-    with compiled TorchScript.
-
-
-According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
-Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
-their model to be re-used in other programs, such as efficiency-oriented C++ programs.
-
-We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
-be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
-they can be exported, and what to be mindful of when using these models with TorchScript.
-
-Exporting a model needs two things:
-
-* dummy inputs to execute a model forward pass.
-* the model needs to be instantiated with the ``torchscript`` flag.
-
-These necessities imply several things developers should be careful about. These are detailed below.
-
-
-Implications
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-TorchScript flag and tied weights
-------------------------------------------------
-This flag is necessary because most of the language models in this repository have tied weights between their
-``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights,
-it is therefore necessary to untie the weights beforehand.
-
-This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
-separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
-leading to unexpected results.
-
-This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
-can be safely exported without the ``torchscript`` flag.
-
-Dummy inputs and standard lengths
-------------------------------------------------
-
-The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
-Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
-to create the "trace" of the model.
-
-The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
-input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
-as:
-
-``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
-
-will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
-input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
-will have been traced with a large input size however, the dimensions of the different matrices will be large as well,
-resulting in more calculations.
-
-It is recommended to be careful of the total number of operations done on each input and to follow performance closely
-when exporting varying sequence-length models.
-
-Using TorchScript in Python
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Below are examples of using Python to save and load models, as well as how to use the trace for inference.
-
-Saving a model
-------------------------------------------------
-
-This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
-according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
-
-.. code-block:: python
-
-    from transformers import BertModel, BertTokenizer, BertConfig
-    import torch
-
-    enc = BertTokenizer.from_pretrained("bert-base-uncased")
-
-    # Tokenizing input text
-    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-    tokenized_text = enc.tokenize(text)
-
-    # Masking one of the input tokens
-    masked_index = 8
-    tokenized_text[masked_index] = '[MASK]'
-    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
-    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-    # Creating a dummy input
-    tokens_tensor = torch.tensor([indexed_tokens])
-    segments_tensors = torch.tensor([segments_ids])
-    dummy_input = [tokens_tensor, segments_tensors]
-
-    # Initializing the model with the torchscript flag
-    # Flag set to True even though it is not necessary as this model does not have an LM Head.
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
-
-    # Instantiating the model
-    model = BertModel(config)
-
-    # The model needs to be in evaluation mode
-    model.eval()
-
-    # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
-    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-
-    # Creating the trace
-    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
-    torch.jit.save(traced_model, "traced_bert.pt")
-
-Loading a model
-------------------------------------------------
-
-This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
-We are re-using the previously initialised ``dummy_input``.
-
-.. code-block:: python
-
-    loaded_model = torch.jit.load("traced_bert.pt")
-    loaded_model.eval()
-
-    all_encoder_layers, pooled_output = loaded_model(*dummy_input)
-
-Using a traced model for inference
-------------------------------------------------
-
-Using the traced model for inference is as simple as using its ``__call__`` dunder method:
-
-.. code-block:: python
-
-    traced_model(tokens_tensor, segments_tensors)
diff --git a/server/transformers/examples/README.md b/server/transformers/examples/README.md
deleted file mode 100644
index d161d1b832bdd994f08b8564e3ee06fe71524afd..0000000000000000000000000000000000000000
--- a/server/transformers/examples/README.md
+++ /dev/null
@@ -1,801 +0,0 @@
-# Examples
-
-In this section a few examples are put together. All of these examples work for several models, making use of the very
-similar API between the different models.
-
-**Important**  
-To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
-Execute the following steps in a new virtual environment:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install .
-pip install -r ./examples/requirements.txt
-```
-
-| Section                    | Description                                                                                                                                                |
-|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [TensorFlow 2.0 models on GLUE](#tensorflow-20-bert-models-on-glue) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. |
-| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
-| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
-| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
-| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training.                                                                                  |
-| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
-| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
-| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019). |
-
-## TensorFlow 2.0 Bert models on GLUE
-
-Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
-
-Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the  MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
-
-This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
-Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
-These options and the below benchmark are provided by @tlkh.
-
-Quick benchmarks from the script (no other modifications):
-
-| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
-| --------- | -------- | ----------------------- | ----------------------|
-| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
-| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
-| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
-| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
-| 1080 Ti | FP32 | 55s | - |
-
-Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
-
-## Language model fine-tuning
-
-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
-
-Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
-to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
-are fine-tuned using a masked language modeling (MLM) loss.
-
-Before running the following example, you should get a file that contains text on which the language model will be
-fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
-
-We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
-text that will be used for evaluation.
-
-### GPT-2/GPT and causal language modeling
-
-The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
-the tokenization). The loss here is that of causal language modeling.
-
-```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_lm_finetuning.py \
-    --output_dir=output \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2 \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE
-```
-
-This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
-a score of ~20 perplexity once fine-tuned on the dataset.
-
-### RoBERTa/BERT and masked language modeling
-
-The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
-as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
-pre-training: masked language modeling. 
-
-In accordance with the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
-slightly slower (over-fitting takes more epochs).
-
-We use the `--mlm` flag so that the script may change its loss function.
-
-```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_lm_finetuning.py \
-    --output_dir=output \
-    --model_type=roberta \
-    --model_name_or_path=roberta-base \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE \
-    --mlm
-```
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
-
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
-A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
-can try out the different models available in the library.
-
-Example usage:
-
-```bash
-python run_generation.py \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2
-```
-
-## GLUE
-
-Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
-
-Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
-Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
-
-GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on single V100 GPUs with total train
-batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
-between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
-
-| Task  | Metric                       | Result      |
-|-------|------------------------------|-------------|
-| CoLA  | Matthews corr.               | 49.23       |
-| SST-2 | Accuracy                     | 91.97       |
-| MRPC  | F1/Accuracy                  | 89.47/85.29 |
-| STS-B | Pearson/Spearman corr.       | 83.95/83.70 |
-| QQP   | Accuracy/F1                  | 88.40/84.31 |
-| MNLI  | Matched acc./Mismatched acc. | 80.61/81.08 |
-| QNLI  | Accuracy                     | 87.46       |
-| RTE   | Accuracy                     | 61.73       |
-| WNLI  | Accuracy                     | 45.07       |
-
-Some of these results are significantly different from the ones reported on the test set
-of the GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name $TASK_NAME \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/$TASK_NAME \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file `eval_results.txt` in the specified output_dir. 
-In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate 
-output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
-
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, 
-CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being 
-said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well, 
-since the data processor for each task inherits from the base class DataProcessor.
-
-### MRPC
-
-#### Fine-tuning example
-
-The following example fine-tunes BERT on the Microsoft Research Paraphrase Corpus (MRPC) and runs in less
-than 10 minutes on a single K-80 and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
-```
-
-Our tests, run on a few seeds with [the original implementation
-hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks), gave evaluation
-results between 84% and 88%.
-
-#### Using Apex and mixed-precision
-
-Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install 
-[apex](https://github.com/NVIDIA/apex), then run the following example:
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/ \
-  --fp16
-```
-
-#### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
-reaches F1 > 92 on MRPC.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name MRPC \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MRPC/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-acc = 0.8823529411764706
-acc_and_f1 = 0.901702786377709
-eval_loss = 0.3418912578906332
-f1 = 0.9210526315789473
-global_step = 174
-loss = 0.07231863956341798
-```
-
-### MNLI
-
-The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name mnli \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MNLI/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir output_dir
-```
-
-The results  are the following:
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-## Multiple Choice
-
-Based on the script [`run_multiple_choice.py`](https://github.com/huggingface/transformers/blob/master/examples/run_multiple_choice.py).
-
-#### Fine-tuning on SWAG
-Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
-
-```bash
-#training on 4 tesla V100(16GB) GPUS
-export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/run_multiple_choice.py \
---model_type roberta \
---task_name swag \
---model_name_or_path roberta-base \
---do_train \
---do_eval \
---do_lower_case \
---data_dir $SWAG_DIR \
---learning_rate 5e-5 \
---num_train_epochs 3 \
---max_seq_length 80 \
---output_dir models_bert/swag_base \
---per_gpu_eval_batch_size=16 \
---per_gpu_train_batch_size=16 \
---gradient_accumulation_steps 2 \
---overwrite_output
-```
-Training with the defined hyper-parameters yields the following results:
-```
-***** Eval results *****
-eval_acc = 0.8338998300509847
-eval_loss = 0.44457291918821606
-```
-
-## SQuAD
-
-Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
-
-#### Fine-tuning BERT on SQuAD1.0
-
-This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
-on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a 
-$SQUAD_DIR directory.
-
-* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-
-And for SQuAD2.0, you need to download:
-
-- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
-- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
-- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --per_gpu_train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 88.52
-exact_match = 81.22
-```
-
-#### Distributed training
-
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 93.15
-exact_match = 86.91
-```
-
-This fine-tuned model is available as a checkpoint under the reference
-`bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-#### Fine-tuning XLNet on SQuAD
-
-This example code fine-tunes XLNet on both the SQuAD1.0 and SQuAD2.0 datasets. See above to download the data for SQuAD.
-
-##### Command for SQuAD1.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=4  \
-    --per_gpu_train_batch_size=4   \
-    --save_steps 5000
-```
-
-##### Command for SQuAD2.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --version_2_with_negative \
-    --train_file $SQUAD_DIR/train-v2.0.json \
-    --predict_file $SQUAD_DIR/dev-v2.0.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 4 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=2  \
-    --per_gpu_train_batch_size=2   \
-    --save_steps 5000
-```
-
-Larger batch size may improve the performance while costing more memory.
-
-##### Results for SQuAD1.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 85.45884578997162,
-"f1": 92.5974600601065,
-"total": 10570,
-"HasAns_exact": 85.45884578997162,
-"HasAns_f1": 92.59746006010651,
-"HasAns_total": 10570
-}
-```
-
-##### Results for SQuAD2.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 80.4177545691906,
-"f1": 84.07154997729623,
-"total": 11873,
-"HasAns_exact": 76.73751686909581,
-"HasAns_f1": 84.05558584352873,
-"HasAns_total": 5928,
-"NoAns_exact": 84.0874684608915,
-"NoAns_f1": 84.0874684608915,
-"NoAns_total": 5945
-}
-```
-
-
-
-## Named Entity Recognition
-
-Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
-[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
-This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
-Details and results for the fine-tuning provided by @stefan-it.
-
-### Data (Download and pre-processing steps)
-
-Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
-
-Here are the commands for downloading and pre-processing train, dev and test datasets. The original data format has four (tab-separated) columns, in a pre-processing step only the two relevant columns (token and outer span NER annotation) are extracted:
-
-```bash
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
-```
-
-The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`. One problem with these tokens is that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s. I wrote a script that a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
-
-```bash
-wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
-```
-Let's define some variables that we need for further pre-processing steps and training the model:
-
-```bash
-export MAX_LENGTH=128
-export BERT_MODEL=bert-base-multilingual-cased
-```
-
-Run the pre-processing script on training, dev and test datasets:
-
-```bash
-python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
-python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
-python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
-```
-
-The GermEval 2014 dataset has many more labels than the CoNLL-2002/2003 datasets, so its own set of labels must be used:
-
-```bash
-cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
-```
-
-### Prepare the run
-
-Additional environment variables must be set:
-
-```bash
-export OUTPUT_DIR=germeval-model
-export BATCH_SIZE=32
-export NUM_EPOCHS=3
-export SAVE_STEPS=750
-export SEED=1
-```
-
-### Run the Pytorch version
-
-To start training, just run:
-
-```bash
-python3 run_ner.py --data_dir ./ \
---model_type bert \
---labels ./labels.txt \
---model_name_or_path $BERT_MODEL \
---output_dir $OUTPUT_DIR \
---max_seq_length  $MAX_LENGTH \
---num_train_epochs $NUM_EPOCHS \
---per_gpu_train_batch_size $BATCH_SIZE \
---save_steps $SAVE_STEPS \
---seed $SEED \
---do_train \
---do_eval \
---do_predict
-```
-
-If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-
-```bash
-10/04/2019 00:42:06 - INFO - __main__ -   ***** Eval results  *****
-10/04/2019 00:42:06 - INFO - __main__ -     f1 = 0.8623348017621146
-10/04/2019 00:42:06 - INFO - __main__ -     loss = 0.07183869666975543
-10/04/2019 00:42:06 - INFO - __main__ -     precision = 0.8467916366258111
-10/04/2019 00:42:06 - INFO - __main__ -     recall = 0.8784592370979806
-```
-
-On the test dataset the following results could be achieved:
-
-```bash
-10/04/2019 00:42:42 - INFO - __main__ -   ***** Eval results  *****
-10/04/2019 00:42:42 - INFO - __main__ -     f1 = 0.8614389652384803
-10/04/2019 00:42:42 - INFO - __main__ -     loss = 0.07064602487454782
-10/04/2019 00:42:42 - INFO - __main__ -     precision = 0.8604651162790697
-10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
-```
-
-#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
-
-Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
-
-| Model | F-Score Dev | F-Score Test
-| --------------------------------- | ------- | --------
-| `bert-large-cased`            | 95.59 | 91.70
-| `roberta-large`                  | 95.96 | 91.87
-| `distilbert-base-uncased` | 94.34 | 90.32
-
-### Run the Tensorflow 2 version
-
-To start training, just run:
-
-```bash
-python3 run_tf_ner.py --data_dir ./ \
---model_type bert \
---labels ./labels.txt \
---model_name_or_path $BERT_MODEL \
---output_dir $OUTPUT_DIR \
---max_seq_length  $MAX_LENGTH \
---num_train_epochs $NUM_EPOCHS \
---per_device_train_batch_size $BATCH_SIZE \
---save_steps $SAVE_STEPS \
---seed $SEED \
---do_train \
---do_eval \
---do_predict
-```
-
-As with the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-```bash
-           precision    recall  f1-score   support
-
- LOCderiv     0.7619    0.6154    0.6809        52
-  PERpart     0.8724    0.8997    0.8858      4057
-  OTHpart     0.9360    0.9466    0.9413       711
-  ORGpart     0.7015    0.6989    0.7002       269
-  LOCpart     0.7668    0.8488    0.8057       496
-      LOC     0.8745    0.9191    0.8963       235
- ORGderiv     0.7723    0.8571    0.8125        91
- OTHderiv     0.4800    0.6667    0.5581        18
-      OTH     0.5789    0.6875    0.6286        16
- PERderiv     0.5385    0.3889    0.4516        18
-      PER     0.5000    0.5000    0.5000         2
-      ORG     0.0000    0.0000    0.0000         3
-
-micro avg     0.8574    0.8862    0.8715      5968
-macro avg     0.8575    0.8862    0.8713      5968
-```
-
-On the test dataset the following results could be achieved:
-```bash
-           precision    recall  f1-score   support
-
-  PERpart     0.8847    0.8944    0.8896      9397
-  OTHpart     0.9376    0.9353    0.9365      1639
-  ORGpart     0.7307    0.7044    0.7173       697
-      LOC     0.9133    0.9394    0.9262       561
-  LOCpart     0.8058    0.8157    0.8107      1150
-      ORG     0.0000    0.0000    0.0000         8
- OTHderiv     0.5882    0.4762    0.5263        42
- PERderiv     0.6571    0.5227    0.5823        44
-      OTH     0.4906    0.6667    0.5652        39
- ORGderiv     0.7016    0.7791    0.7383       172
- LOCderiv     0.8256    0.6514    0.7282       109
-      PER     0.0000    0.0000    0.0000        11
-
-micro avg     0.8722    0.8774    0.8748     13869
-macro avg     0.8712    0.8774    0.8740     13869
-```
-
-## XNLI
-
-Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
-
-[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
-
-#### Fine-tuning on XNLI
-
-This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
-on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a 
-`$XNLI_DIR` directory.
-
-* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
-* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
-
-```bash
-export XNLI_DIR=/path/to/XNLI
-
-python run_xnli.py \
-  --model_type bert \
-  --model_name_or_path bert-base-multilingual-cased \
-  --language de \
-  --train_language en \
-  --do_train \
-  --do_eval \
-  --data_dir $XNLI_DIR \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 5e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 128 \
-  --output_dir /tmp/debug_xnli/ \
-  --save_steps -1
-```
-
-Training with the previously defined hyper-parameters yields the following results on the **test** set:
-
-```bash
-acc = 0.7093812375249501
-```
-
-## MM-IMDb
-
-Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
-
-[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
-
-### Training on MM-IMDb
-
-```
-python run_mmimdb.py \
-    --data_dir /path/to/mmimdb/dataset/ \
-    --model_type bert \
-    --model_name_or_path bert-base-uncased \
-    --output_dir /path/to/save/dir/ \
-    --do_train \
-    --do_eval \
-    --max_seq_len 512 \
-    --gradient_accumulation_steps 20 \
-    --num_image_embeds 3 \
-    --num_train_epochs 100 \
-    --patience 5
-```
-
-## Adversarial evaluation of model performances
-
-Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
-
-The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
-
-This is an example of using test_hans.py:
-
-```bash
-export HANS_DIR=path-to-hans
-export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
-export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-
-python examples/test_hans.py \
-        --task_name hans \
-        --model_type $MODEL_TYPE \
-        --do_eval \
-        --do_lower_case \
-        --data_dir $HANS_DIR \
-        --model_name_or_path $MODEL_PATH \
-        --max_seq_length 128 \
-        --output_dir $MODEL_PATH
-```
-
-This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
-
-The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows:
-
-```bash
-Heuristic entailed results:
-lexical_overlap: 0.9702
-subsequence: 0.9942
-constituent: 0.9962
-
-Heuristic non-entailed results:
-lexical_overlap: 0.199
-subsequence: 0.0396
-constituent: 0.118
-```
diff --git a/server/transformers/examples/benchmarks.py b/server/transformers/examples/benchmarks.py
deleted file mode 100644
index 07de19d4b518674bb27dd0b5d2b378bfe934e576..0000000000000000000000000000000000000000
--- a/server/transformers/examples/benchmarks.py
+++ /dev/null
@@ -1,531 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Benchmarking the library on inference and training """
-
-# If checking the tensors placement
-# tf.debugging.set_log_device_placement(True)
-
-import argparse
-import csv
-import timeit
-from time import time
-from typing import List
-
-from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers import TFAutoModel
-
-if is_torch_available():
-    import torch
-    from transformers import AutoModel
-
-
-input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
-the Director of Hatcheries and Conditioning entered the room, in the
-
-
-
-scarcely breathing silence, the absent-minded, soliloquizing hum or
-whistle, of absorbed concentration. A troop of newly arrived students,
-very young, pink and callow, followed nervously, rather abjectly, at the
-Director's heels. Each of them carried a notebook, in which, whenever
-the great man spoke, he desperately scribbled. Straight from the
-horse's mouth. It was a rare privilege. The D. H. C. for Central London
-always made a point of personally conducting his new students round
-the various departments.
-
-"Just to give you a general idea," he would explain to them. For of
-course some sort of general idea they must have, if they were to do
-their work intelligently-though as little of one, if they were to be good
-and happy members of society, as possible. For particulars, as every
-one knows, make for virtue and happiness; generalities are intellectu-
-ally necessary evils. Not philosophers but fret-sawyers and stamp col-
-lectors compose the backbone of society.
-
-"To-morrow," he would add, smiling at them with a slightly menacing
-geniality, "you'll be settling down to serious work. You won't have time
-for generalities. Meanwhile ..."
-
-Meanwhile, it was a privilege. Straight from the horse's mouth into the
-notebook. The boys scribbled like mad.
-
-Tall and rather thin but upright, the Director advanced into the room.
-He had a long chin and big rather prominent teeth, just covered, when
-he was not talking, by his full, floridly curved lips. Old, young? Thirty?
-Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
-arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
-
-"I shall begin at the beginning," said the D.H.C. and the more zealous
-students recorded his intention in their notebooks: Begin at the begin-
-ning. "These," he waved his hand, "are the incubators." And opening
-an insulated door he showed them racks upon racks of numbered test-
-tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
-whereas the male gametes," and here he opened another door, "they
-have to be kept at thirty-five instead of thirty-seven. Full blood heat
-sterilizes." Rams wrapped in theremogene beget no lambs.
-
-Still leaning against the incubators he gave them, while the pencils
-scurried illegibly across the pages, a brief description of the modern
-
-
-
-fertilizing process; spoke first, of course, of its surgical introduc-
-tion-"the operation undergone voluntarily for the good of Society, not
-to mention the fact that it carries a bonus amounting to six months'
-salary"; continued with some account of the technique for preserving
-the excised ovary alive and actively developing; passed on to a consid-
-eration of optimum temperature, salinity, viscosity; referred to the liq-
-uor in which the detached and ripened eggs were kept; and, leading
-his charges to the work tables, actually showed them how this liquor
-was drawn off from the test-tubes; how it was let out drop by drop
-onto the specially warmed slides of the microscopes; how the eggs
-which it contained were inspected for abnormalities, counted and
-transferred to a porous receptacle; how (and he now took them to
-watch the operation) this receptacle was immersed in a warm bouillon
-containing free-swimming spermatozoa-at a minimum concentration
-of one hundred thousand per cubic centimetre, he insisted; and how,
-after ten minutes, the container was lifted out of the liquor and its
-contents re-examined; how, if any of the eggs remained unfertilized, it
-was again immersed, and, if necessary, yet again; how the fertilized
-ova went back to the incubators; where the Alphas and Betas re-
-mained until definitely bottled; while the Gammas, Deltas and Epsilons
-were brought out again, after only thirty-six hours, to undergo Bo-
-kanovsky's Process.
-
-"Bokanovsky's Process," repeated the Director, and the students un-
-derlined the words in their little notebooks.
-
-One egg, one embryo, one adult-normality. But a bokanovskified egg
-will bud, will proliferate, will divide. From eight to ninety-six buds, and
-every bud will grow into a perfectly formed embryo, and every embryo
-into a full-sized adult. Making ninety-six human beings grow where
-only one grew before. Progress.
-
-"Essentially," the D.H.C. concluded, "bokanovskification consists of a
-series of arrests of development. We check the normal growth and,
-paradoxically enough, the egg responds by budding."
-
-Responds by budding. The pencils were busy.
-
-He pointed. On a very slowly moving band a rack-full of test-tubes was
-entering a large metal box, another, rack-full was emerging. Machinery
-faintly purred. It took eight minutes for the tubes to go through, he
-
-
-
-told them. Eight minutes of hard X-rays being about as much as an
-egg can stand. A few died; of the rest, the least susceptible divided
-into two; most put out four buds; some eight; all were returned to the
-incubators, where the buds began to develop; then, after two days,
-were suddenly chilled, chilled and checked. Two, four, eight, the buds
-in their turn budded; and having budded were dosed almost to death
-with alcohol; consequently burgeoned again and having budded-bud
-out of bud out of bud-were thereafter-further arrest being generally
-fatal-left to develop in peace. By which time the original egg was in a
-fair way to becoming anything from eight to ninety-six embryos- a
-prodigious improvement, you will agree, on nature. Identical twins-but
-not in piddling twos and threes as in the old viviparous days, when an
-egg would sometimes accidentally divide; actually by dozens, by
-scores at a time.
-
-"Scores," the Director repeated and flung out his arms, as though he
-were distributing largesse. "Scores."
-
-But one of the students was fool enough to ask where the advantage
-lay.
-
-"My good boy!" The Director wheeled sharply round on him. "Can't you
-see? Can't you see?" He raised a hand; his expression was solemn.
-"Bokanovsky's Process is one of the major instruments of social stabil-
-ity!"
-
-Major instruments of social stability.
-
-Standard men and women; in uniform batches. The whole of a small
-factory staffed with the products of a single bokanovskified egg.
-
-"Ninety-six identical twins working ninety-six identical machines!" The
-voice was almost tremulous with enthusiasm. "You really know where
-you are. For the first time in history." He quoted the planetary motto.
-"Community, Identity, Stability." Grand words. "If we could bo-
-kanovskify indefinitely the whole problem would be solved."
-
-Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
-lions of identical twins. The principle of mass production at last applied
-to biology.
-
-
-
-"But, alas," the Director shook his head, "we can't bokanovskify indefi-
-nitely."
-
-Ninety-six seemed to be the limit; seventy-two a good average. From
-the same ovary and with gametes of the same male to manufacture as
-many batches of identical twins as possible-that was the best (sadly a
-second best) that they could do. And even that was difficult.
-
-"For in nature it takes thirty years for two hundred eggs to reach ma-
-turity. But our business is to stabilize the population at this moment,
-here and now. Dribbling out twins over a quarter of a century-what
-would be the use of that?"
-
-Obviously, no use at all. But Podsnap's Technique had immensely ac-
-celerated the process of ripening. They could make sure of at least a
-hundred and fifty mature eggs within two years. Fertilize and bo-
-kanovskify-in other words, multiply by seventy-two-and you get an
-average of nearly eleven thousand brothers and sisters in a hundred
-and fifty batches of identical twins, all within two years of the same
-age.
-
-"And in exceptional cases we can make one ovary yield us over fifteen
-thousand adult individuals."
-
-Beckoning to a fair-haired, ruddy young man who happened to be
-passing at the moment. "Mr. Foster," he called. The ruddy young man
-approached. "Can you tell us the record for a single ovary, Mr. Foster?"
-
-"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
-out hesitation. He spoke very quickly, had a vivacious blue eye, and
-took an evident pleasure in quoting figures. "Sixteen thousand and
-twelve; in one hundred and eighty-nine batches of identicals. But of
-course they've done much better," he rattled on, "in some of the tropi-
-cal Centres. Singapore has often produced over sixteen thousand five
-hundred; and Mombasa has actually touched the seventeen thousand
-mark. But then they have unfair advantages. You should see the way a
-negro ovary responds to pituitary! It's quite astonishing, when you're
-used to working with European material. Still," he added, with a laugh
-(but the light of combat was in his eyes and the lift of his chin was
-challenging), "still, we mean to beat them if we can. I'm working on a
-wonderful Delta-Minus ovary at this moment. Only just eighteen
-
-
-
-months old. Over twelve thousand seven hundred children already, ei-
-ther decanted or in embryo. And still going strong. We'll beat them
-yet."
-
-"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
-the shoulder. "Come along with us, and give these boys the benefit of
-your expert knowledge."
-
-Mr. Foster smiled modestly. "With pleasure." They went.
-In the Bottling Room all was harmonious bustle and ordered activity.
-Flaps of fresh sow's peritoneum ready cut to the proper size came
-shooting up in little lifts from the Organ Store in the sub-basement.
-Whizz and then, click! the lift-hatches hew open; the bottle-liner had
-only to reach out a hand, take the flap, insert, smooth-down, and be-
-fore the lined bottle had had time to travel out of reach along the end-
-less band, whizz, click! another flap of peritoneum had shot up from
-the depths, ready to be slipped into yet another bottle, the next of that
-slow interminable procession on the band.
-
-Next to the Liners stood the Matriculators. The procession advanced;
-one by one the eggs were transferred from their test-tubes to the
-larger containers; deftly the peritoneal lining was slit, the morula
-dropped into place, the saline solution poured in ... and already the
-bottle had passed, and it was the turn of the labellers. Heredity, date
-of fertilization, membership of Bokanovsky Group-details were trans-
-ferred from test-tube to bottle. No longer anonymous, but named,
-identified, the procession marched slowly on; on through an opening in
-the wall, slowly on into the Social Predestination Room.
-"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
-as they entered."""
-
-
-def create_setup_and_compute(
-    model_names: List[str],
-    gpu: bool = True,
-    tensorflow: bool = False,
-    average_over: int = 3,
-    torchscript: bool = False,
-    xla: bool = False,
-    amp: bool = False,
-    fp16: bool = False,
-    save_to_csv: bool = False,
-    csv_filename: str = f"results_{round(time())}.csv",
-):
-    if xla:
-        tf.config.optimizer.set_jit(True)
-    if amp:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if tensorflow:
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
-    else:
-        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
-
-    print("=========== RESULTS ===========")
-    for model_name in model_names:
-        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
-        for batch_size in results[model_name]["bs"]:
-            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
-            for slice_size in results[model_name]["ss"]:
-                result = results[model_name]["results"][batch_size][slice_size]
-                if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
-                else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
-
-    if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file:
-            fieldnames = [
-                "model",
-                "1x8",
-                "1x64",
-                "1x128",
-                "1x256",
-                "1x512",
-                "1x1024",
-                "2x8",
-                "2x64",
-                "2x128",
-                "2x256",
-                "2x512",
-                "2x1024",
-                "4x8",
-                "4x64",
-                "4x128",
-                "4x256",
-                "4x512",
-                "4x1024",
-                "8x8",
-                "8x64",
-                "8x128",
-                "8x256",
-                "8x512",
-                "8x1024",
-            ]
-
-            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
-            writer.writeheader()
-
-            for model_name in model_names:
-                model_results = {
-                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
-                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]["results"][bs]
-                }
-                writer.writerow({"model": model_name, **model_results})
-
-
-def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
-        model = AutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-
-        for batch_size in batch_sizes:
-            if fp16:
-                model.half()
-            model.to(device)
-            model.eval()
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
-                    try:
-                        if torchscript:
-                            print("Tracing model with sequence size", sequence.shape)
-                            inference = torch.jit.trace(model, sequence)
-                            inference(sequence)
-                        else:
-                            inference = model
-                            inference(sequence)
-
-                        print("Going through model with sequence of shape", sequence.shape)
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                    except RuntimeError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def _compute_tensorflow(model_names, dictionary, average_over, amp):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name)
-        model = TFAutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-
-        print("Using model", model)
-
-        @tf.function
-        def inference(inputs):
-            return model(inputs)
-
-        for batch_size in batch_sizes:
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = tf.stack(
-                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
-                    )
-
-                    try:
-                        print("Going through model with sequence of shape", sequence.shape)
-                        # To make sure that the model is traced + that the tensors are on the appropriate device
-                        inference(sequence)
-
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                    except tf.errors.ResourceExhaustedError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--models",
-        required=False,
-        type=str,
-        default="all",
-        help="Model checkpoints to be provided "
-        "to the AutoModel classes. Leave "
-        "blank to benchmark the base version "
-        "of all available model "
-        "architectures.",
-    )
-    parser.add_argument(
-        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
-    )
-    parser.add_argument(
-        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
-    )
-    parser.add_argument(
-        "--torchscript",
-        required=False,
-        action="store_true",
-        help="Pytorch only: trace the models " "using torchscript",
-    )
-    parser.add_argument(
-        "--tensorflow",
-        required=False,
-        action="store_true",
-        help="Benchmark the TensorFlow version "
-        "of the models. Will run on GPU if "
-        "the correct dependencies are "
-        "installed",
-    )
-    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument(
-        "--amp",
-        required=False,
-        action="store_true",
-        help="TensorFlow only: use automatic mixed precision acceleration.",
-    )
-    parser.add_argument(
-        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
-    )
-    parser.add_argument(
-        "--keras_predict",
-        required=False,
-        action="store_true",
-        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
-    )
-    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument(
-        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
-    )
-    parser.add_argument(
-        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
-    )
-
-    args = parser.parse_args()
-    if args.models == "all":
-        args.models = [
-            "gpt2",
-            "bert-base-cased",
-            "xlnet-base-cased",
-            "xlm-mlm-en-2048",
-            "transfo-xl-wt103",
-            "openai-gpt",
-            "distilbert-base-uncased",
-            "distilgpt2",
-            "roberta-base",
-            "ctrl",
-        ]
-    else:
-        args.models = args.models.split()
-
-    print("Running with arguments", args)
-
-    if args.torch:
-        if is_torch_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                tensorflow=False,
-                gpu=args.torch_cuda,
-                torchscript=args.torchscript,
-                fp16=args.fp16,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-            )
-        else:
-            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
-
-    if args.tensorflow:
-        if is_tf_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                tensorflow=True,
-                xla=args.xla,
-                amp=args.amp,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-            )
-        else:
-            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/contrib/README.md b/server/transformers/examples/contrib/README.md
deleted file mode 100644
index f2d0616e629bcc7d7800d1a4b727e725379ac736..0000000000000000000000000000000000000000
--- a/server/transformers/examples/contrib/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Community contributed examples
-
-This folder contains examples which are not actively maintained (mostly contributed by the community).
-
-Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
diff --git a/server/transformers/examples/contrib/run_camembert.py b/server/transformers/examples/contrib/run_camembert.py
deleted file mode 100644
index 3da66d419b96885b7d4186619174a548bd0abe20..0000000000000000000000000000000000000000
--- a/server/transformers/examples/contrib/run_camembert.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import torch
-
-from transformers.modeling_camembert import CamembertForMaskedLM
-from transformers.tokenization_camembert import CamembertTokenizer
-
-
-def fill_mask(masked_input, model, tokenizer, topk=5):
-    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
-    assert masked_input.count("<mask>") == 1
-    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-    logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
-    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
-    logits = logits[0, masked_index, :]
-    prob = logits.softmax(dim=0)
-    values, indices = prob.topk(k=topk, dim=0)
-    topk_predicted_token_bpe = " ".join(
-        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
-    )
-    masked_token = tokenizer.mask_token
-    topk_filled_outputs = []
-    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
-        predicted_token = predicted_token_bpe.replace("\u2581", " ")
-        if " {0}".format(masked_token) in masked_input:
-            topk_filled_outputs.append(
-                (
-                    masked_input.replace(" {0}".format(masked_token), predicted_token),
-                    values[index].item(),
-                    predicted_token,
-                )
-            )
-        else:
-            topk_filled_outputs.append(
-                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
-            )
-    return topk_filled_outputs
-
-
-tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
-model = CamembertForMaskedLM.from_pretrained("camembert-base")
-model.eval()
-
-masked_input = "Le camembert est <mask> :)"
-print(fill_mask(masked_input, model, tokenizer, topk=3))
diff --git a/server/transformers/examples/contrib/run_openai_gpt.py b/server/transformers/examples/contrib/run_openai_gpt.py
deleted file mode 100644
index 136e25821f1c1e4526c7ef6aa6453e6b3d8ff89e..0000000000000000000000000000000000000000
--- a/server/transformers/examples/contrib/run_openai_gpt.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OpenAI GPT model fine-tuning script.
-    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
-    It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
-
-    This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset:
-        python run_openai_gpt.py \
-          --model_name openai-gpt \
-          --do_train \
-          --do_eval \
-          --train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
-          --eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
-          --output_dir ../log \
-          --train_batch_size 16 \
-"""
-import argparse
-import csv
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from tqdm import tqdm, trange
-
-from transformers import (
-    CONFIG_NAME,
-    WEIGHTS_NAME,
-    AdamW,
-    OpenAIGPTDoubleHeadsModel,
-    OpenAIGPTTokenizer,
-    get_linear_schedule_with_warmup,
-)
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
-)
-logger = logging.getLogger(__name__)
-
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-
-def load_rocstories_dataset(dataset_path):
-    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
-    with open(dataset_path, encoding="utf_8") as f:
-        f = csv.reader(f)
-        output = []
-        next(f)  # skip the first line
-        for line in tqdm(f):
-            output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
-    return output
-
-
-def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
-
-        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
-        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
-    """
-    tensor_datasets = []
-    for dataset in encoded_datasets:
-        n_batch = len(dataset)
-        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
-        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
-        mc_labels = np.zeros((n_batch,), dtype=np.int64)
-        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
-            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
-            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
-            input_ids[i, 0, : len(with_cont1)] = with_cont1
-            input_ids[i, 1, : len(with_cont2)] = with_cont2
-            mc_token_ids[i, 0] = len(with_cont1) - 1
-            mc_token_ids[i, 1] = len(with_cont2) - 1
-            lm_labels[i, 0, : len(with_cont1)] = with_cont1
-            lm_labels[i, 1, : len(with_cont2)] = with_cont2
-            mc_labels[i] = mc_label
-        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
-        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
-    return tensor_datasets
-
-
-def main():
-    """Fine-tune and/or evaluate an OpenAI GPT double-heads model on RocStories.
-
-    Parses the command line, loads the tokenizer and model named by
-    ``--model_name``, registers three extra special tokens, encodes the train
-    and eval datasets, and then optionally trains (``--do_train``) and
-    evaluates (``--do_eval``). After training, the weights, config and
-    vocabulary are written to ``--output_dir``; evaluation metrics are written
-    to ``eval_results.txt`` in the same directory.
-
-    Raises:
-        ValueError: if neither ``--do_train`` nor ``--do_eval`` is given.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument("--train_dataset", type=str, default="")
-    parser.add_argument("--eval_dataset", type=str, default="")
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--num_train_epochs", type=int, default=3)
-    parser.add_argument("--train_batch_size", type=int, default=8)
-    parser.add_argument("--eval_batch_size", type=int, default=16)
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    # NOTE(review): --max_grad_norm is parsed but never read anywhere in this function.
-    parser.add_argument("--max_grad_norm", type=int, default=1)
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training \
-                        steps to perform. Override num_train_epochs.",
-    )
-    # NOTE(review): this value only enters the step-count arithmetic below;
-    # the training loop itself steps the optimizer on every batch.
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before\
-                        performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
-    parser.add_argument("--weight_decay", type=float, default=0.01)
-    parser.add_argument("--lm_coef", type=float, default=0.9)
-    parser.add_argument("--n_valid", type=int, default=374)
-
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-    print(args)
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Seed every random number generator used below.
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    torch.cuda.manual_seed_all(args.seed)
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    logger.info("device: {}, n_gpu {}".format(device, n_gpu))
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    # Load tokenizer and model
-    # The loading code also adds new tokens and embeddings called `special tokens`
-    # These new embeddings will be fine-tuned on the RocStories dataset
-    special_tokens = ["_start_", "_delimiter_", "_classify_"]
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
-    tokenizer.add_tokens(special_tokens)
-    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
-    model.resize_token_embeddings(len(tokenizer))
-    model.to(device)
-
-    # Load and encode the datasets
-    def tokenize_and_encode(obj):
-        """ Tokenize and encode a nested object """
-        if isinstance(obj, str):
-            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
-        elif isinstance(obj, int):
-            return obj
-        return list(tokenize_and_encode(o) for o in obj)
-
-    logger.info("Encoding dataset...")
-    train_dataset = load_rocstories_dataset(args.train_dataset)
-    eval_dataset = load_rocstories_dataset(args.eval_dataset)
-    datasets = (train_dataset, eval_dataset)
-    encoded_datasets = tokenize_and_encode(datasets)
-
-    # Compute the max input length for the Transformer
-    # Each input is story + continuation + 3 special tokens.
-    max_length = model.config.n_positions // 2 - 2
-    input_length = max(
-        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
-        for dataset in encoded_datasets
-        for story, cont1, cont2, _ in dataset
-    )
-    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
-
-    # Prepare inputs tensors and dataloaders
-    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
-    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
-
-    train_data = TensorDataset(*train_tensor_dataset)
-    train_sampler = RandomSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    eval_data = TensorDataset(*eval_tensor_dataset)
-    eval_sampler = SequentialSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # Prepare optimizer
-    if args.do_train:
-        if args.max_steps > 0:
-            t_total = args.max_steps
-            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-        else:
-            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Parameters whose name contains one of these substrings get no weight decay.
-        param_optimizer = list(model.named_parameters())
-        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
-        optimizer_grouped_parameters = [
-            {
-                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
-                "weight_decay": args.weight_decay,
-            },
-            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-        ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = get_linear_schedule_with_warmup(
-            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-        )
-
-    if args.do_train:
-        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            # Running totals are reset every epoch, so the reported train loss
-            # covers the last epoch only.
-            tr_loss = 0
-            nb_tr_steps = 0
-            tqdm_bar = tqdm(train_dataloader, desc="Training")
-            for step, batch in enumerate(tqdm_bar):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, mc_token_ids, lm_labels, mc_labels = batch
-                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
-                # Weighted sum of the language-modeling loss and the multiple-choice loss.
-                loss = args.lm_coef * losses[0] + losses[1]
-                loss.backward()
-                # Update the weights first, then advance the learning-rate schedule.
-                # Stepping the scheduler before the optimizer skips the first value
-                # of the schedule (PyTorch >= 1.1.0 warns about that order).
-                optimizer.step()
-                scheduler.step()
-                optimizer.zero_grad()
-                tr_loss += loss.item()
-                # Exponential moving average of the loss, shown in the progress bar.
-                exp_average_loss = (
-                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
-                )
-                nb_tr_steps += 1
-                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
-
-    # Save a trained model
-    if args.do_train:
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
-        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
-        model.to(device)
-
-    if args.do_eval:
-        model.eval()
-        eval_loss, eval_accuracy = 0, 0
-        nb_eval_steps, nb_eval_examples = 0, 0
-        for batch in tqdm(eval_dataloader, desc="Evaluating"):
-            batch = tuple(t.to(device) for t in batch)
-            input_ids, mc_token_ids, lm_labels, mc_labels = batch
-            with torch.no_grad():
-                _, mc_loss, _, mc_logits = model(
-                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
-                )
-
-            mc_logits = mc_logits.detach().cpu().numpy()
-            mc_labels = mc_labels.to("cpu").numpy()
-            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
-
-            eval_loss += mc_loss.mean().item()
-            eval_accuracy += tmp_eval_accuracy
-
-            nb_eval_examples += input_ids.size(0)
-            nb_eval_steps += 1
-
-        # Loss is averaged per batch, accuracy per example.
-        eval_loss = eval_loss / nb_eval_steps
-        eval_accuracy = eval_accuracy / nb_eval_examples
-        train_loss = tr_loss / nb_tr_steps if args.do_train else None
-        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-
-# Run the script's main() only when this file is executed directly.
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/contrib/run_swag.py b/server/transformers/examples/contrib/run_swag.py
deleted file mode 100644
index 497ddeca9de3e4687017fa0c6526523199693ff5..0000000000000000000000000000000000000000
--- a/server/transformers/examples/contrib/run_swag.py
+++ /dev/null
@@ -1,737 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner.
-   Finetuning the library models for multiple choice on SWAG (Bert).
-"""
-
-
-import argparse
-import csv
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    get_linear_schedule_with_warmup,
-)
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-# Module-level logger named after this module.
-logger = logging.getLogger(__name__)
-
-# Names of every pretrained BERT archive known to the config class; listed in
-# the --model_name_or_path help text.
-ALL_MODELS = tuple(BertConfig.pretrained_config_archive_map.keys())
-
-# Maps a --model_type value to its (config, model, tokenizer) classes.
-MODEL_CLASSES = dict(bert=(BertConfig, BertForMultipleChoice, BertTokenizer))
-
-
-class SwagExample(object):
-    """One SWAG example: a context sentence, a shared ending prefix and four candidate endings.
-
-    The four endings are stored, in order, in the `endings` list. `label` is
-    kept as given and may be None.
-    """
-
-    def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
-        self.swag_id = swag_id
-        self.context_sentence = context_sentence
-        self.start_ending = start_ending
-        self.endings = [ending_0, ending_1, ending_2, ending_3]
-        self.label = label
-
-    def __str__(self):
-        # The printable form is the same as the debug form.
-        return repr(self)
-
-    def __repr__(self):
-        pieces = []
-        pieces.append("swag_id: {}".format(self.swag_id))
-        pieces.append("context_sentence: {}".format(self.context_sentence))
-        pieces.append("start_ending: {}".format(self.start_ending))
-        pieces.append("ending_0: {}".format(self.endings[0]))
-        pieces.append("ending_1: {}".format(self.endings[1]))
-        pieces.append("ending_2: {}".format(self.endings[2]))
-        pieces.append("ending_3: {}".format(self.endings[3]))
-
-        # The label is only shown when one was provided.
-        if self.label is not None:
-            pieces.append("label: {}".format(self.label))
-
-        return ", ".join(pieces)
-
-
-class InputFeatures(object):
-    """Model-ready features for one example: one dict of id/mask/segment lists per choice."""
-
-    def __init__(self, example_id, choices_features, label):
-        self.example_id = example_id
-        # Each incoming item is a 4-tuple; its first element is dropped and the
-        # remaining three are stored under fixed keys.
-        per_choice = []
-        for _unused, ids, mask, segments in choices_features:
-            per_choice.append({"input_ids": ids, "input_mask": mask, "segment_ids": segments})
-        self.choices_features = per_choice
-        self.label = label
-
-
-def read_swag_examples(input_file, is_training=True):
-    """Read a SWAG csv file into a list of `SwagExample` objects.
-
-    Args:
-        input_file: path to the csv file; its first row holds the column names.
-        is_training: when True, the last header column must be "label" and
-            column 11 of every data row is parsed as the integer label.
-
-    Returns:
-        A list with one `SwagExample` per data row (the header row is skipped).
-
-    Raises:
-        ValueError: if `is_training` is True and the file has no header row or
-            the last header column is not "label".
-    """
-    # newline="" lets the csv module handle line endings itself, so quoted
-    # fields containing line breaks are read back unchanged.
-    with open(input_file, "r", encoding="utf-8", newline="") as f:
-        lines = list(csv.reader(f))
-
-    # An empty file or a blank header row has no label column either; report
-    # it with the same error instead of an IndexError.
-    has_label_column = bool(lines) and bool(lines[0]) and lines[0][-1] == "label"
-    if is_training and not has_label_column:
-        raise ValueError("For training, the input file must contain a label column.")
-
-    examples = [
-        SwagExample(
-            swag_id=line[2],
-            context_sentence=line[4],
-            start_ending=line[5],  # in the swag dataset, the
-            # common beginning of each
-            # choice is stored in "sent2".
-            ending_0=line[7],
-            ending_1=line[8],
-            ending_2=line[9],
-            ending_3=line[10],
-            label=int(line[11]) if is_training else None,
-        )
-        for line in lines[1:]  # we skip the line with the column names
-    ]
-
-    return examples
-
-
-def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
-    """Turn SWAG examples into a list of `InputFeatures`, one per example.
-
-    Swag is a multiple choice task. Following the formatting proposed in
-    "Improving Language Understanding by Generative Pre-Training" and
-    suggested in https://github.com/google-research/bert/issues/38, every
-    ending of an example becomes its own input:
-
-        [CLS] context [SEP] start_ending + ending [SEP]
-
-    The model outputs a single value per input; the final decision is a
-    softmax over the values of one example's inputs.
-
-    Args:
-        examples: iterable of objects with `swag_id`, `context_sentence`,
-            `start_ending`, `endings` and `label` attributes.
-        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
-        max_seq_length: every id/mask/segment list is truncated and zero-padded
-            to exactly this length.
-        is_training: only controls whether the label of the first few
-            examples is logged.
-
-    Returns:
-        A list of `InputFeatures`.
-    """
-    all_features = []
-    for position, example in tqdm(enumerate(examples)):
-        context = tokenizer.tokenize(example.context_sentence)
-        shared_prefix = tokenizer.tokenize(example.start_ending)
-
-        per_choice = []
-        for ending in example.endings:
-            # Work on a copy of the context so truncating it for one ending
-            # does not affect the other endings.
-            context_part = list(context)
-            ending_part = shared_prefix + tokenizer.tokenize(ending)
-            # Shrinks both lists in place; "- 3" leaves room for [CLS], [SEP], [SEP].
-            _truncate_seq_pair(context_part, ending_part, max_seq_length - 3)
-
-            tokens = ["[CLS]"] + context_part + ["[SEP]"] + ending_part + ["[SEP]"]
-            # Segment 0 covers [CLS] + context + [SEP]; segment 1 covers ending + [SEP].
-            segment_ids = [0] * (len(context_part) + 2) + [1] * (len(ending_part) + 1)
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            input_mask = [1] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding = [0] * (max_seq_length - len(input_ids))
-            input_ids += padding
-            input_mask += padding
-            segment_ids += padding
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            per_choice.append((tokens, input_ids, input_mask, segment_ids))
-
-        label = example.label
-        # Log the first five examples in full for inspection.
-        if position < 5:
-            logger.info("*** Example ***")
-            logger.info("swag_id: {}".format(example.swag_id))
-            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(per_choice):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(" ".join(tokens)))
-                logger.info("input_ids: {}".format(" ".join(str(i) for i in input_ids)))
-                logger.info("input_mask: {}".format(" ".join(str(m) for m in input_mask)))
-                logger.info("segment_ids: {}".format(" ".join(str(s) for s in segment_ids)))
-            if is_training:
-                logger.info("label: {}".format(label))
-
-        all_features.append(InputFeatures(example_id=example.swag_id, choices_features=per_choice, label=label))
-
-    return all_features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Shorten two token lists in place until their combined length fits `max_length`.
-
-    Tokens are removed one at a time from the end of whichever list is
-    currently longer (from `tokens_b` on a tie). Trimming the longer list
-    keeps more of a short sequence, whose tokens each carry more information.
-    """
-    while len(tokens_a) + len(tokens_b) > max_length:
-        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
-        longer.pop()
-
-
-def accuracy(out, labels):
-    """Return the number of rows of `out` whose arg-max along axis 1 equals the matching label."""
-    predicted = np.argmax(out, axis=1)
-    hits = predicted == labels
-    return np.sum(hits)
-
-
-def select_field(features, field):
-    """Collect `choice[field]` for every choice of every feature, as one inner list per feature."""
-    selected = []
-    for feature in features:
-        row = []
-        for choice in feature.choices_features:
-            row.append(choice[field])
-        selected.append(row)
-    return selected
-
-
-def set_seed(args):
-    """Seed the Python, NumPy and torch random number generators from `args.seed`.
-
-    The CUDA generators are seeded as well when `args.n_gpu` is positive.
-    """
-    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
-        seed_fn(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    """Build the train or eval `TensorDataset`, reusing a cached feature file when allowed.
-
-    Args:
-        args: namespace providing `local_rank`, `train_file`, `predict_file`,
-            `model_name_or_path`, `max_seq_length` and `overwrite_cache`.
-        tokenizer: passed through to `convert_examples_to_features`.
-        evaluate: read `args.predict_file` instead of `args.train_file`.
-        output_examples: also return the raw examples and features; this
-            forces the features to be rebuilt rather than loaded from cache.
-
-    Returns:
-        The dataset, or `(dataset, examples, features)` when `output_examples` is set.
-    """
-    is_first_process = args.local_rank in [-1, 0]
-    if not is_first_process:
-        # Make sure only the first process in distributed training processes
-        # the dataset; the others wait here and then use the cache.
-        torch.distributed.barrier()
-
-    # Load data features from cache or dataset file
-    input_file = args.predict_file if evaluate else args.train_file
-    split_name = "dev" if evaluate else "train"
-    # Last non-empty component of the model name or path.
-    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
-    cache_name = "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length))
-    cached_features_file = os.path.join(os.path.dirname(input_file), cache_name)
-
-    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_swag_examples(input_file)
-        features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
-
-        if is_first_process:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0:
-        # The first process is done; release the processes waiting at the
-        # barrier above so they can read the cache.
-        torch.distributed.barrier()
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
-    all_label = torch.tensor([f.label for f in features], dtype=torch.long)
-
-    # The same four tensors are used for both training and evaluation.
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-
-    if not output_examples:
-        return dataset
-    return dataset, examples, features
-
-
-def train(args, train_dataset, model, tokenizer):
-    """Train `model` on `train_dataset`.
-
-    Builds the dataloader, an AdamW optimizer with a linear warmup/decay
-    schedule, optional apex fp16 and (distributed) data-parallel wrappers, then
-    runs the training loop with gradient accumulation, periodic TensorBoard
-    logging and periodic checkpointing into `args.output_dir`.
-
-    Args:
-        args: parsed command-line namespace; `args.train_batch_size` and
-            (when `args.max_steps > 0`) `args.num_train_epochs` are overwritten.
-        train_dataset: dataset whose batches are
-            (input_ids, attention_mask, token_type_ids, labels).
-        model: the model to train; its first output is used as the loss.
-        tokenizer: saved next to every checkpoint and forwarded to `evaluate`.
-
-    Returns:
-        A tuple `(global_step, tr_loss / global_step)`: the number of optimizer
-        updates performed and the accumulated loss divided by that number.
-
-    Raises:
-        ImportError: if `args.fp16` is set and apex is not installed.
-    """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    # NOTE(review): the DistributedSampler's epoch is never set below, so its
-    # shuffling may repeat every epoch - confirm against the PyTorch docs.
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    # Parameters whose name contains one of these substrings get no weight decay.
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproducibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                # 'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
-                "token_type_ids": batch[2],
-                "labels": batch[3],
-            }
-            # if args.model_type in ['xlnet', 'xlm']:
-            #     inputs.update({'cls_index': batch[5],
-            #                    'p_mask':       batch[6]})
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                # Clip once per optimizer update, on the fully accumulated
-                # gradients. Clipping after every micro-batch would rescale
-                # partial sums and distort the accumulated gradient.
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-            # Stop as soon as exactly max_steps updates have been performed
-            # (">" would run one update more than requested).
-            if args.max_steps > 0 and global_step >= args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step >= args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    """Evaluate `model` on the prediction file and write the metrics to disk.
-
-    Args:
-        args: parsed command-line namespace; `args.eval_batch_size` is overwritten.
-        model: the model to evaluate; its first two outputs are read as
-            (loss, logits).
-        tokenizer: forwarded to `load_and_cache_examples`.
-        prefix: only used in the "Running evaluation" log line.
-
-    Returns:
-        A dict with "eval_loss" (mean over batches) and "eval_accuracy"
-        (correct predictions divided by the number of examples). The same
-        values are written to `eval_results.txt` in `args.output_dir`.
-    """
-    dataset, _examples, _features = load_and_cache_examples(
-        args, tokenizer, evaluate=True, output_examples=True
-    )
-
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    if args.local_rank == -1:
-        eval_sampler = SequentialSampler(dataset)
-    else:
-        eval_sampler = DistributedSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-
-    total_loss, total_correct = 0, 0
-    num_batches, num_examples = 0, 0
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
-        batch = tuple(t.to(args.device) for t in batch)
-        with torch.no_grad():
-            # Batch layout: (input_ids, attention_mask, token_type_ids, labels).
-            inputs = dict(
-                input_ids=batch[0],
-                attention_mask=batch[1],
-                token_type_ids=batch[2],
-                labels=batch[3],
-            )
-            outputs = model(**inputs)
-            batch_loss, logits = outputs[:2]
-            total_loss += batch_loss.mean().item()
-
-        logits = logits.detach().cpu().numpy()
-        label_ids = inputs["labels"].to("cpu").numpy()
-        # `accuracy` returns a count of correct predictions, not a ratio.
-        total_correct += accuracy(logits, label_ids)
-
-        num_batches += 1
-        num_examples += inputs["input_ids"].size(0)
-
-    result = {
-        "eval_loss": total_loss / num_batches,
-        "eval_accuracy": total_correct / num_examples,
-    }
-
-    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results *****")
-        for key in sorted(result.keys()):
-            value_text = str(result[key])
-            logger.info("%s = %s", key, value_text)
-            writer.write("%s = %s\n" % (key, value_text))
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
-    )
-    parser.add_argument(
-        "--predict_file",
-        default=None,
-        type=str,
-        required=True,
-        help="SWAG csv for predictions. E.g., val.csv or test.csv",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model checkpoints and predictions will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=384,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences "
-        "longer than this will be truncated, and sequences shorter than this will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Save the trained model and the tokenizer
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        if args.do_train:
-            checkpoints = [args.output_dir]
-        else:
-            # if do_train is False and do_eval is true, load model directly from pretrained.
-            checkpoints = [args.model_name_or_path]
-
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
-
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        for checkpoint in checkpoints:
-            # Reload the model
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            tokenizer = tokenizer_class.from_pretrained(checkpoint)
-            model.to(args.device)
-
-            # Evaluate
-            result = evaluate(args, model, tokenizer, prefix=global_step)
-
-            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
-            results.update(result)
-
-    logger.info("Results: {}".format(results))
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/contrib/run_transfo_xl.py b/server/transformers/examples/contrib/run_transfo_xl.py
deleted file mode 100644
index 84e2806a7b2abc8d2b8d082610db060ca1d68c2d..0000000000000000000000000000000000000000
--- a/server/transformers/examples/contrib/run_transfo_xl.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Transformer XL model evaluation script.
-    Adapted from https://github.com/kimiyoung/transformer-xl.
-    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
-
-    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
-"""
-
-
-import argparse
-import logging
-import math
-import time
-
-import torch
-
-from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
-)
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
-    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
-    parser.add_argument(
-        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
-    )
-    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
-    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
-    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
-    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
-    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
-    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
-    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
-    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
-    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-    assert args.ext_len >= 0, "extended context length must be non-negative"
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-    logger.info("device: {}".format(device))
-
-    # Load a pre-processed dataset
-    # You can also build the corpus yourself using TransfoXLCorpus methods
-    # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
-    # and tokenizing the dataset
-    # The pre-processed corpus is a convertion (using the conversion script )
-    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
-
-    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
-    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
-
-    # Load a pre-trained model
-    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
-    model = model.to(device)
-
-    logger.info(
-        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
-            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
-        )
-    )
-
-    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
-    if args.clamp_len > 0:
-        model.clamp_len = args.clamp_len
-    if args.same_length:
-        model.same_length = True
-
-    ###############################################################################
-    # Evaluation code
-    ###############################################################################
-    def evaluate(eval_iter):
-        # Turn on evaluation mode which disables dropout.
-        model.eval()
-        total_len, total_loss = 0, 0.0
-        start_time = time.time()
-        with torch.no_grad():
-            mems = None
-            for idx, (data, target, seq_len) in enumerate(eval_iter):
-                ret = model(data, lm_labels=target, mems=mems)
-                loss, _, mems = ret
-                loss = loss.mean()
-                total_loss += seq_len * loss.item()
-                total_len += seq_len
-            total_time = time.time() - start_time
-        logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
-        return total_loss / total_len
-
-    # Run on test data.
-    if args.split == "all":
-        test_loss = evaluate(te_iter)
-        valid_loss = evaluate(va_iter)
-    elif args.split == "valid":
-        valid_loss = evaluate(va_iter)
-        test_loss = None
-    elif args.split == "test":
-        test_loss = evaluate(te_iter)
-        valid_loss = None
-
-    def format_log(loss, split):
-        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
-        return log_str
-
-    log_str = ""
-    if valid_loss is not None:
-        log_str += format_log(valid_loss, "valid")
-    if test_loss is not None:
-        log_str += format_log(test_loss, "test")
-
-    logger.info("=" * 100)
-    logger.info(log_str)
-    logger.info("=" * 100)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/distillation/README.md b/server/transformers/examples/distillation/README.md
deleted file mode 100644
index c8fbb01aa43e95b625eaaf92b7d1091d9d6fddaa..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/README.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# Distil*
-
-This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
-
-**January 20, 2020 - Bug fixing** We have recently discovered and fixed [a bug](https://github.com/huggingface/transformers/commit/48cbf267c988b56c71a2380f748a3e6092ccaed3) in the evaluation of our `run_*.py` scripts that caused the reported metrics to be over-estimated on average. We have updated all the metrics with the latest runs.
-
-**December 6, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
-
-**November 19, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
-
-**October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
-
-**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
-
-**September 19, 2019 - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
-
-
-## What is Distil*
-
-Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 99% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
-
-We have applied the same method to other Transformer architectures and released the weights:
-- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
-- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
-- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
-- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice faster and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
-
-For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
-
-Here are the results on the dev sets of GLUE:
-
-| Model                     | Macro-score                    | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI              |
-| :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
-| BERT-base-uncased         |  **77.6**                      | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1              |
-| DistilBERT-base-uncased   |  **76.8**                      | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3              |
-| ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
-| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup>  |
-| DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |
-
-<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
-
-<sup>2</sup> Macro-score computed without WNLI.
-
-<sup>3</sup> We compute this score ourselves for completeness.
-
-Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
-
-| Model                        | English | Spanish | Chinese | German | Arabic  | Urdu |
-| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:   | :---:|
-| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4    | 58.5 |
-| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1    | 58.3 |
-| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1    | 54.7 |
-
-## Setup
-
-This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
-
-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0).
-
-
-## How to use DistilBERT
-
-Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):
-
-- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
-- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
-- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
-- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
-- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
-- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
-
-Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
-
-```python
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
-
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
-outputs = model(input_ids)
-last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-```
-
-Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
-- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
-- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
-- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
-
-
-## How to train Distil*
-
-In the following, we will explain how you can train DistilBERT.
-
-### A. Preparing the data
-
-The weights we release are trained using a concatenation of Toronto Book Corpus and English Wikipedia (same training data as the English version of BERT).
-
-To avoid processing the data several time, we do it once and for all before the training. From now on, will suppose that you have a text file `dump.txt` which contains one sequence per line (a sequence being composed of one of several coherent sentences).
-
-First, we will binarize the data, i.e. tokenize the data and convert each token in an index in our model's vocabulary.
-
-```bash
-python scripts/binarized_data.py \
-    --file_path data/dump.txt \
-    --tokenizer_type bert \
-    --tokenizer_name bert-base-uncased \
-    --dump_file data/binarized_text
-```
-
-Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smoothes the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurences of each tokens in the data:
-
-```bash
-python scripts/token_counts.py \
-    --data_file data/binarized_text.bert-base-uncased.pickle \
-    --token_counts_dump data/token_counts.bert-base-uncased.pickle \
-    --vocab_size 30522
-```
-
-### B. Training
-
-Training with distillation is really simple once you have pre-processed the data:
-
-```bash
-python train.py \
-    --student_type distilbert \
-    --student_config training_configs/distilbert-base-uncased.json \
-    --teacher_type bert \
-    --teacher_name bert-base-uncased \
-    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --alpha_clm 0.0 --mlm \
-    --freeze_pos_embs \
-    --dump_path serialization_dir/my_first_training \
-    --data_file data/binarized_text.bert-base-uncased.pickle \
-    --token_counts data/token_counts.bert-base-uncased.pickle \
-    --force # overwrites the `dump_path` if it already exists.
-```
-
-By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
-
-We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
-
-```bash
-export NODE_RANK=0
-export N_NODES=1
-
-export N_GPU_NODE=4
-export WORLD_SIZE=4
-export MASTER_PORT=<AN_OPEN_PORT>
-export MASTER_ADDR=<I.P.>
-
-pkill -f 'python -u train.py'
-
-python -m torch.distributed.launch \
-    --nproc_per_node=$N_GPU_NODE \
-    --nnodes=$N_NODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
-    train.py \
-        --force \
-        --n_gpu $WORLD_SIZE \
-        --student_type distilbert \
-        --student_config training_configs/distilbert-base-uncased.json \
-        --teacher_type bert \
-        --teacher_name bert-base-uncased \
-        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --alpha_clm 0.0 --mlm \
-        --freeze_pos_embs \
-        --dump_path serialization_dir/my_first_training \
-        --data_file data/binarized_text.bert-base-uncased.pickle \
-        --token_counts data/token_counts.bert-base-uncased.pickle
-```
-
-**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!
-
-Happy distillation!
-
-## Citation
-
-If you find the ressource useful, you should cite the following paper:
-
-```
-@inproceedings{sanh2019distilbert,
-  title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},
-  author={Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
-  booktitle={NeurIPS EMC^2 Workshop},
-  year={2019}
-}
-```
diff --git a/server/transformers/examples/distillation/distiller.py b/server/transformers/examples/distillation/distiller.py
deleted file mode 100644
index 53669623b6f67a0e6c740717ce86409c67b0ad97..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/distiller.py
+++ /dev/null
@@ -1,603 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" The distiller to distil the student.
-    Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
-"""
-import math
-import os
-import time
-
-import psutil
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.optim import AdamW
-from torch.utils.data import BatchSampler, DataLoader, RandomSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm
-
-from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
-from lm_seqs_dataset import LmSeqsDataset
-from transformers import get_linear_schedule_with_warmup
-from utils import logger
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-class Distiller:
-    def __init__(
-        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
-    ):
-        logger.info("Initializing Distiller")
-        self.params = params
-        self.dump_path = params.dump_path
-        self.multi_gpu = params.multi_gpu
-        self.fp16 = params.fp16
-
-        self.student = student
-        self.teacher = teacher
-
-        self.student_config = student.config
-        self.vocab_size = student.config.vocab_size
-
-        if params.n_gpu <= 1:
-            sampler = RandomSampler(dataset)
-        else:
-            sampler = DistributedSampler(dataset)
-
-        if params.group_by_size:
-            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
-            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
-        else:
-            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
-
-        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
-
-        self.temperature = params.temperature
-        assert self.temperature > 0.0
-
-        self.alpha_ce = params.alpha_ce
-        self.alpha_mlm = params.alpha_mlm
-        self.alpha_clm = params.alpha_clm
-        self.alpha_mse = params.alpha_mse
-        self.alpha_cos = params.alpha_cos
-
-        self.mlm = params.mlm
-        if self.mlm:
-            logger.info(f"Using MLM loss for LM step.")
-            self.mlm_mask_prop = params.mlm_mask_prop
-            assert 0.0 <= self.mlm_mask_prop <= 1.0
-            assert params.word_mask + params.word_keep + params.word_rand == 1.0
-            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
-            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
-            if self.fp16:
-                self.pred_probs = self.pred_probs.half()
-                self.token_probs = self.token_probs.half()
-        else:
-            logger.info(f"Using CLM loss for LM step.")
-
-        self.epoch = 0
-        self.n_iter = 0
-        self.n_total_iter = 0
-        self.n_sequences_epoch = 0
-        self.total_loss_epoch = 0
-        self.last_loss = 0
-        self.last_loss_ce = 0
-        self.last_loss_mlm = 0
-        self.last_loss_clm = 0
-        if self.alpha_mse > 0.0:
-            self.last_loss_mse = 0
-        if self.alpha_cos > 0.0:
-            self.last_loss_cos = 0
-        self.last_log = 0
-
-        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
-        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
-        if self.alpha_mse > 0.0:
-            self.mse_loss_fct = nn.MSELoss(reduction="sum")
-        if self.alpha_cos > 0.0:
-            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")
-
-        logger.info("--- Initializing model optimizer")
-        assert params.gradient_accumulation_steps >= 1
-        self.num_steps_epoch = len(self.dataloader)
-        num_train_optimization_steps = (
-            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
-        )
-
-        no_decay = ["bias", "LayerNorm.weight"]
-        optimizer_grouped_parameters = [
-            {
-                "params": [
-                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
-                ],
-                "weight_decay": params.weight_decay,
-            },
-            {
-                "params": [
-                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
-                ],
-                "weight_decay": 0.0,
-            },
-        ]
-        logger.info(
-            "------ Number of trainable parameters (student): %i"
-            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
-        )
-        logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
-        self.optimizer = AdamW(
-            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
-        )
-
-        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = get_linear_schedule_with_warmup(
-            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
-        )
-
-        if self.fp16:
-            try:
-                from apex import amp
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-            logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
-            self.student, self.optimizer = amp.initialize(
-                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
-            )
-            self.teacher = self.teacher.half()
-
-        if self.multi_gpu:
-            if self.fp16:
-                from apex.parallel import DistributedDataParallel
-
-                logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
-                self.student = DistributedDataParallel(self.student)
-            else:
-                from torch.nn.parallel import DistributedDataParallel
-
-                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
-                self.student = DistributedDataParallel(
-                    self.student,
-                    device_ids=[params.local_rank],
-                    output_device=params.local_rank,
-                    find_unused_parameters=True,
-                )
-
-        self.is_master = params.is_master
-        if self.is_master:
-            logger.info("--- Initializing Tensorboard")
-            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
-            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
-            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
-
-    def prepare_batch_mlm(self, batch):
-        """
-        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
-
-        Input:
-        ------
-            batch: `Tuple`
-                token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
-                lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
-
-        Output:
-        -------
-            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
-            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
-        """
-        token_ids, lengths = batch
-        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
-        assert token_ids.size(0) == lengths.size(0)
-
-        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
-
-        bs, max_seq_len = token_ids.size()
-        mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-
-        x_prob = self.token_probs[token_ids.flatten()]
-        n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
-        tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(
-            bs * max_seq_len, dtype=torch.bool, device=token_ids.device
-        )  # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
-        pred_mask[tgt_ids] = 1
-        pred_mask = pred_mask.view(bs, max_seq_len)
-
-        pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0
-
-        # mask a number of words == 0 [8] (faster with fp16)
-        if self.fp16:
-            n1 = pred_mask.sum().item()
-            if n1 > 8:
-                pred_mask = pred_mask.view(-1)
-                n2 = max(n1 % 8, 8 * (n1 // 8))
-                if n2 != n1:
-                    pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
-                pred_mask = pred_mask.view(bs, max_seq_len)
-                assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
-
-        _token_ids_real = token_ids[pred_mask]
-        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
-        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
-        probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
-        _token_ids = (
-            _token_ids_mask * (probs == 0).long()
-            + _token_ids_real * (probs == 1).long()
-            + _token_ids_rand * (probs == 2).long()
-        )
-        token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
-
-        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
-
-        # sanity checks
-        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
-
-        return token_ids, attn_mask, mlm_labels
-
-    def prepare_batch_clm(self, batch):
-        """
-        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
-
-        Input:
-        ------
-            batch: `Tuple`
-                token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
-                lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
-
-        Output:
-        -------
-            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
-            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
-        """
-        token_ids, lengths = batch
-        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
-        assert token_ids.size(0) == lengths.size(0)
-
-        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
-        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
-
-        # sanity checks
-        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
-
-        return token_ids, attn_mask, clm_labels
-
-    def round_batch(self, x: torch.tensor, lengths: torch.tensor):
-        """
-        For float16 only.
-        Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
-
-        Input:
-        ------
-            x: `torch.tensor(bs, seq_length)` - The token ids.
-            lengths: `torch.tensor(bs, seq_length)` - The lengths of each of the sequence in the batch.
-
-        Output:
-        -------
-            x:  `torch.tensor(new_bs, new_seq_length)` - The updated token ids.
-            lengths: `torch.tensor(new_bs, new_seq_length)` - The updated lengths.
-        """
-        if not self.fp16 or len(lengths) < 8:
-            return x, lengths
-
-        # number of sentences == 0 [8]
-        bs1 = len(lengths)
-        bs2 = 8 * (bs1 // 8)
-        assert bs2 > 0 and bs2 % 8 == 0
-        if bs1 != bs2:
-            idx = torch.randperm(bs1)[:bs2]
-            lengths = lengths[idx]
-            slen = lengths.max().item()
-            x = x[idx, :slen]
-        else:
-            idx = None
-
-        # sequence length == 0 [8]
-        ml1 = x.size(1)
-        if ml1 % 8 != 0:
-            pad = 8 - (ml1 % 8)
-            ml2 = ml1 + pad
-            if self.mlm:
-                pad_id = self.params.special_tok_ids["pad_token"]
-            else:
-                pad_id = self.params.special_tok_ids["unk_token"]
-            padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
-            x = torch.cat([x, padding_tensor], 1)
-            assert x.size() == (bs2, ml2)
-
-        assert x.size(0) % 8 == 0
-        assert x.size(1) % 8 == 0
-        return x, lengths
-
-    def train(self):
-        """
-        The real training loop.
-        """
-        if self.is_master:
-            logger.info("Starting training")
-        self.last_log = time.time()
-        self.student.train()
-        self.teacher.eval()
-
-        for _ in range(self.params.n_epoch):
-            if self.is_master:
-                logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
-            if self.multi_gpu:
-                torch.distributed.barrier()
-
-            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
-            for batch in iter_bar:
-                if self.params.n_gpu > 0:
-                    batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)
-
-                if self.mlm:
-                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
-                else:
-                    token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
-                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
-
-                iter_bar.update()
-                iter_bar.set_postfix(
-                    {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
-                )
-            iter_bar.close()
-
-            if self.is_master:
-                logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
-            self.end_epoch()
-
-        if self.is_master:
-            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
-            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
-            logger.info("Training is finished")
-
-    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
-        """
-        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
-        and possibly a parameter update (depending on the gradient accumulation).
-
-        Input:
-        ------
-        input_ids: `torch.tensor(bs, seq_length)` - The token ids.
-        attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
-        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
-        """
-        if self.mlm:
-            s_logits, s_hidden_states = self.student(
-                input_ids=input_ids, attention_mask=attention_mask
-            )  # (bs, seq_length, voc_size)
-            with torch.no_grad():
-                t_logits, t_hidden_states = self.teacher(
-                    input_ids=input_ids, attention_mask=attention_mask
-                )  # (bs, seq_length, voc_size)
-        else:
-            s_logits, _, s_hidden_states = self.student(
-                input_ids=input_ids, attention_mask=None
-            )  # (bs, seq_length, voc_size)
-            with torch.no_grad():
-                t_logits, _, t_hidden_states = self.teacher(
-                    input_ids=input_ids, attention_mask=None
-                )  # (bs, seq_length, voc_size)
-        assert s_logits.size() == t_logits.size()
-
-        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
-        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
-        if self.params.restrict_ce_to_mask:
-            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
-        else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
-        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
-        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
-        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
-        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
-        assert t_logits_slct.size() == s_logits_slct.size()
-
-        loss_ce = (
-            self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
-            )
-            * (self.temperature) ** 2
-        )
-        loss = self.alpha_ce * loss_ce
-
-        if self.alpha_mlm > 0.0:
-            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
-            loss += self.alpha_mlm * loss_mlm
-        if self.alpha_clm > 0.0:
-            shift_logits = s_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            loss += self.alpha_clm * loss_clm
-
-        if self.alpha_mse > 0.0:
-            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
-                0
-            )  # Reproducing batchmean reduction
-            loss += self.alpha_mse * loss_mse
-        if self.alpha_cos > 0.0:
-            s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
-            t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
-            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
-            assert s_hidden_states.size() == t_hidden_states.size()
-            dim = s_hidden_states.size(-1)
-
-            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
-            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
-            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
-            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
-
-            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
-            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
-            loss += self.alpha_cos * loss_cos
-
-        self.total_loss_epoch += loss.item()
-        self.last_loss = loss.item()
-        self.last_loss_ce = loss_ce.item()
-        if self.alpha_mlm > 0.0:
-            self.last_loss_mlm = loss_mlm.item()
-        if self.alpha_clm > 0.0:
-            self.last_loss_clm = loss_clm.item()
-        if self.alpha_mse > 0.0:
-            self.last_loss_mse = loss_mse.item()
-        if self.alpha_cos > 0.0:
-            self.last_loss_cos = loss_cos.item()
-
-        self.optimize(loss)
-
-        self.n_sequences_epoch += input_ids.size(0)
-
-    def optimize(self, loss):
-        """
-        Normalization on the loss (gradient accumulation or distributed training), followed by
-        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
-        Also update the metrics for tensorboard.
-        """
-        # Check for NaN
-        if (loss != loss).data.any():
-            logger.error("NaN detected")
-            exit()
-
-        if self.multi_gpu:
-            loss = loss.mean()
-        if self.params.gradient_accumulation_steps > 1:
-            loss = loss / self.params.gradient_accumulation_steps
-
-        if self.fp16:
-            from apex import amp
-
-            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
-                scaled_loss.backward()
-        else:
-            loss.backward()
-
-        self.iter()
-        if self.n_iter % self.params.gradient_accumulation_steps == 0:
-            if self.fp16:
-                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
-            else:
-                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
-            self.optimizer.step()
-            self.optimizer.zero_grad()
-            self.scheduler.step()
-
-    def iter(self):
-        """
-        Update global counts, write to tensorboard and save checkpoint.
-        """
-        self.n_iter += 1
-        self.n_total_iter += 1
-
-        if self.n_total_iter % self.params.log_interval == 0:
-            self.log_tensorboard()
-            self.last_log = time.time()
-        if self.n_total_iter % self.params.checkpoint_interval == 0:
-            self.save_checkpoint()
-
-    def log_tensorboard(self):
-        """
-        Log into tensorboard. Only by the master process.
-        """
-        if not self.is_master:
-            return
-
-        for param_name, param in self.student.named_parameters():
-            self.tensorboard.add_scalar(
-                tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
-            )
-            self.tensorboard.add_scalar(
-                tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
-            )
-            if param.grad is None:
-                continue
-            self.tensorboard.add_scalar(
-                tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
-            )
-            self.tensorboard.add_scalar(
-                tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
-            )
-
-        self.tensorboard.add_scalar(
-            tag="losses/cum_avg_loss_epoch",
-            scalar_value=self.total_loss_epoch / self.n_iter,
-            global_step=self.n_total_iter,
-        )
-        self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(
-            tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
-        )
-        if self.alpha_mlm > 0.0:
-            self.tensorboard.add_scalar(
-                tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
-            )
-        if self.alpha_clm > 0.0:
-            self.tensorboard.add_scalar(
-                tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
-            )
-        if self.alpha_mse > 0.0:
-            self.tensorboard.add_scalar(
-                tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
-            )
-        if self.alpha_cos > 0.0:
-            self.tensorboard.add_scalar(
-                tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
-            )
-        self.tensorboard.add_scalar(
-            tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
-        )
-
-        self.tensorboard.add_scalar(
-            tag="global/memory_usage",
-            scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
-            global_step=self.n_total_iter,
-        )
-        self.tensorboard.add_scalar(
-            tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
-        )
-
-    def end_epoch(self):
-        """
-        Finally arrived at the end of epoch (full pass on dataset).
-        Do some tensorboard logging and checkpoint saving.
-        """
-        logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")
-
-        if self.is_master:
-            self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
-            self.tensorboard.add_scalar(
-                tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
-            )
-
-        self.epoch += 1
-        self.n_sequences_epoch = 0
-        self.n_iter = 0
-        self.total_loss_epoch = 0
-
-    def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
-        """
-        Save the current state. Only by the master process.
-        """
-        if not self.is_master:
-            return
-        mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
-        mdl_to_save.config.save_pretrained(self.dump_path)
-        state_dict = mdl_to_save.state_dict()
-        torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
diff --git a/server/transformers/examples/distillation/grouped_batch_sampler.py b/server/transformers/examples/distillation/grouped_batch_sampler.py
deleted file mode 100644
index c386c4224d25a9caada95c392269e61699b4b337..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/grouped_batch_sampler.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
-"""
-import bisect
-import copy
-from collections import defaultdict
-
-import numpy as np
-from torch.utils.data.sampler import BatchSampler, Sampler
-
-from utils import logger
-
-
-def _quantize(x, bins):
-    bins = copy.deepcopy(bins)
-    bins = sorted(bins)
-    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
-    return quantized
-
-
-def create_lengths_groups(lengths, k=0):
-    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
-    groups = _quantize(lengths, bins)
-    # count number of elements per group
-    counts = np.unique(groups, return_counts=True)[1]
-    fbins = [0] + bins + [np.inf]
-    logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
-    logger.info("Count of instances per bin: {}".format(counts))
-    return groups
-
-
-class GroupedBatchSampler(BatchSampler):
-    """
-    Wraps another sampler to yield a mini-batch of indices.
-    It enforces that the batch only contain elements from the same group.
-    It also tries to provide mini-batches which follows an ordering which is
-    as close as possible to the ordering from the original sampler.
-    Arguments:
-        sampler (Sampler): Base sampler.
-        group_ids (list[int]): If the sampler produces indices in range [0, N),
-            `group_ids` must be a list of `N` ints which contains the group id of each sample.
-            The group ids must be a continuous set of integers starting from
-            0, i.e. they must be in the range [0, num_groups).
-        batch_size (int): Size of mini-batch.
-    """
-
-    def __init__(self, sampler, group_ids, batch_size):
-        if not isinstance(sampler, Sampler):
-            raise ValueError(
-                "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
-            )
-        self.sampler = sampler
-        self.group_ids = group_ids
-        self.batch_size = batch_size
-
-    def __iter__(self):
-        buffer_per_group = defaultdict(list)
-        samples_per_group = defaultdict(list)
-
-        num_batches = 0
-        for idx in self.sampler:
-            group_id = self.group_ids[idx]
-            buffer_per_group[group_id].append(idx)
-            samples_per_group[group_id].append(idx)
-            if len(buffer_per_group[group_id]) == self.batch_size:
-                yield buffer_per_group[group_id]  # TODO
-                num_batches += 1
-                del buffer_per_group[group_id]
-            assert len(buffer_per_group[group_id]) < self.batch_size
-
-        # now we have run out of elements that satisfy
-        # the group criteria, let's return the remaining
-        # elements so that the size of the sampler is
-        # deterministic
-        expected_num_batches = len(self)
-        num_remaining = expected_num_batches - num_batches
-        if num_remaining > 0:
-            # for the remaining batches, group the batches by similar lengths
-            batch_idx = []
-            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
-                batch_idx.extend(idxs)
-                if len(batch_idx) >= self.batch_size:
-                    yield batch_idx[: self.batch_size]
-                    batch_idx = batch_idx[self.batch_size :]
-                    num_remaining -= 1
-            if len(batch_idx) > 0:
-                yield batch_idx
-                num_remaining -= 1
-        assert num_remaining == 0
-
-    def __len__(self):
-        """
-        Return the number of mini-batches rather than the number of samples.
-        """
-        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
diff --git a/server/transformers/examples/distillation/lm_seqs_dataset.py b/server/transformers/examples/distillation/lm_seqs_dataset.py
deleted file mode 100644
index 8f444f4e0e151f1342016e86ba60199cebc39dec..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/lm_seqs_dataset.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Dataset to distilled models
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
-"""
-import numpy as np
-import torch
-from torch.utils.data import Dataset
-
-from utils import logger
-
-
-class LmSeqsDataset(Dataset):
-    """Custom Dataset wrapping language modeling sequences.
-
-    Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
-
-    Input:
-    ------
-        params: `NameSpace` parameters
-        data: `List[np.array[int]]
-    """
-
-    def __init__(self, params, data):
-        self.params = params
-
-        self.token_ids = np.array(data)
-        self.lengths = np.array([len(t) for t in data])
-
-        self.check()
-        self.remove_long_sequences()
-        self.remove_empty_sequences()
-        self.remove_unknown_sequences()
-        self.check()
-        self.print_statistics()
-
-    def __getitem__(self, index):
-        return (self.token_ids[index], self.lengths[index])
-
-    def __len__(self):
-        return len(self.lengths)
-
-    def check(self):
-        """
-        Some sanity checks
-        """
-        assert len(self.token_ids) == len(self.lengths)
-        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
-
-    def remove_long_sequences(self):
-        """
-        Sequences that are too long are splitted by chunk of max_model_input_size.
-        """
-        max_len = self.params.max_model_input_size
-        indices = self.lengths > max_len
-        logger.info(f"Splitting {sum(indices)} too long sequences.")
-
-        def divide_chunks(l, n):
-            return [l[i : i + n] for i in range(0, len(l), n)]
-
-        new_tok_ids = []
-        new_lengths = []
-        if self.params.mlm:
-            cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
-        else:
-            cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]
-
-        for seq_, len_ in zip(self.token_ids, self.lengths):
-            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
-            if len_ <= max_len:
-                new_tok_ids.append(seq_)
-                new_lengths.append(len_)
-            else:
-                sub_seqs = []
-                for sub_s in divide_chunks(seq_, max_len - 2):
-                    if sub_s[0] != cls_id:
-                        sub_s = np.insert(sub_s, 0, cls_id)
-                    if sub_s[-1] != sep_id:
-                        sub_s = np.insert(sub_s, len(sub_s), sep_id)
-                    assert len(sub_s) <= max_len
-                    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
-                    sub_seqs.append(sub_s)
-
-                new_tok_ids.extend(sub_seqs)
-                new_lengths.extend([len(l) for l in sub_seqs])
-
-        self.token_ids = np.array(new_tok_ids)
-        self.lengths = np.array(new_lengths)
-
-    def remove_empty_sequences(self):
-        """
-        Too short sequences are simply removed. This could be tunedd.
-        """
-        init_size = len(self)
-        indices = self.lengths > 11
-        self.token_ids = self.token_ids[indices]
-        self.lengths = self.lengths[indices]
-        new_size = len(self)
-        logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
-
-    def remove_unknown_sequences(self):
-        """
-        Remove sequences with a (too) high level of unknown tokens.
-        """
-        if "unk_token" not in self.params.special_tok_ids:
-            return
-        else:
-            unk_token_id = self.params.special_tok_ids["unk_token"]
-        init_size = len(self)
-        unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
-        indices = (unk_occs / self.lengths) < 0.5
-        self.token_ids = self.token_ids[indices]
-        self.lengths = self.lengths[indices]
-        new_size = len(self)
-        logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
-
-    def print_statistics(self):
-        """
-        Print some statistics on the corpus. Only the master process.
-        """
-        if not self.params.is_master:
-            return
-        logger.info(f"{len(self)} sequences")
-        # data_len = sum(self.lengths)
-        # nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
-        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
-
-        # unk_idx = self.params.special_tok_ids['unk_token']
-        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
-        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
-
-    def batch_sequences(self, batch):
-        """
-        Do the padding and transform into torch.tensor.
-        """
-        token_ids = [t[0] for t in batch]
-        lengths = [t[1] for t in batch]
-        assert len(token_ids) == len(lengths)
-
-        # Max for paddings
-        max_seq_len_ = max(lengths)
-
-        # Pad token ids
-        if self.params.mlm:
-            pad_idx = self.params.special_tok_ids["pad_token"]
-        else:
-            pad_idx = self.params.special_tok_ids["unk_token"]
-        tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
-        assert len(tk_) == len(token_ids)
-        assert all(len(t) == max_seq_len_ for t in tk_)
-
-        tk_t = torch.tensor(tk_)  # (bs, max_seq_len_)
-        lg_t = torch.tensor(lengths)  # (bs)
-        return tk_t, lg_t
diff --git a/server/transformers/examples/distillation/requirements.txt b/server/transformers/examples/distillation/requirements.txt
deleted file mode 100644
index 1f1a1b8a6e1485772d1ed1d46aff415555de0e18..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-transformers
-
-gitpython==3.0.2
-tensorboard>=1.14.0
-tensorboardX==1.8
-psutil==5.6.3
-scipy==1.3.1
diff --git a/server/transformers/examples/distillation/run_squad_w_distillation.py b/server/transformers/examples/distillation/run_squad_w_distillation.py
deleted file mode 100644
index 4900f19ead6915215ac32edaf87935ee6e5e9afc..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/run_squad_w_distillation.py
+++ /dev/null
@@ -1,864 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" This is the exact same script as `examples/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""
-
-import argparse
-import glob
-import logging
-import os
-import random
-import timeit
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForQuestionAnswering,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForQuestionAnswering,
-    DistilBertTokenizer,
-    XLMConfig,
-    XLMForQuestionAnswering,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetForQuestionAnswering,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-    squad_convert_examples_to_features,
-)
-from transformers.data.metrics.squad_metrics import (
-    compute_predictions_log_probs,
-    compute_predictions_logits,
-    squad_evaluate,
-)
-from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def to_list(tensor):
-    return tensor.detach().cpu().tolist()
-
-
-def train(args, train_dataset, model, tokenizer, teacher=None):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 1
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        try:
-            # set global_step to gobal_step of last saved checkpoint from model path
-            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
-            global_step = int(checkpoint_suffix)
-            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-            logger.info("  Continuing training from epoch %d", epochs_trained)
-            logger.info("  Continuing training from global step %d", global_step)
-            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-        except ValueError:
-            logger.info("  Starting fine-tuning.")
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    # Added here for reproductibility
-    set_seed(args)
-
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            if teacher is not None:
-                teacher.eval()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                "start_positions": batch[3],
-                "end_positions": batch[4],
-            }
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
-                if args.version_2_with_negative:
-                    inputs.update({"is_impossible": batch[7]})
-            outputs = model(**inputs)
-            loss, start_logits_stu, end_logits_stu = outputs
-
-            # Distillation loss
-            if teacher is not None:
-                if "token_type_ids" not in inputs:
-                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
-                with torch.no_grad():
-                    start_logits_tea, end_logits_tea = teacher(
-                        input_ids=inputs["input_ids"],
-                        token_type_ids=inputs["token_type_ids"],
-                        attention_mask=inputs["attention_mask"],
-                    )
-                assert start_logits_tea.size() == start_logits_stu.size()
-                assert end_logits_tea.size() == end_logits_stu.size()
-
-                loss_fct = nn.KLDivLoss(reduction="batchmean")
-                loss_start = loss_fct(
-                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    F.softmax(start_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_end = loss_fct(
-                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    F.softmax(end_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_ce = (loss_start + loss_end) / 2.0
-
-                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                # Log metrics
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Only evaluate when single GPU otherwise metrics may not average well
-                    if args.local_rank == -1 and args.evaluate_during_training:
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
-
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-
-    all_results = []
-    start_time = timeit.default_timer()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
-        batch = tuple(t.to(args.device) for t in batch)
-
-        with torch.no_grad():
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
-            example_indices = batch[3]
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
-
-            outputs = model(**inputs)
-
-        for i, example_index in enumerate(example_indices):
-            eval_feature = features[example_index.item()]
-            unique_id = int(eval_feature.unique_id)
-
-            output = [to_list(output[i]) for output in outputs]
-
-            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
-            # models only use two.
-            if len(output) >= 5:
-                start_logits = output[0]
-                start_top_index = output[1]
-                end_logits = output[2]
-                end_top_index = output[3]
-                cls_logits = output[4]
-
-                result = SquadResult(
-                    unique_id,
-                    start_logits,
-                    end_logits,
-                    start_top_index=start_top_index,
-                    end_top_index=end_top_index,
-                    cls_logits=cls_logits,
-                )
-
-            else:
-                start_logits, end_logits = output
-                result = SquadResult(unique_id, start_logits, end_logits)
-
-            all_results.append(result)
-
-    evalTime = timeit.default_timer() - start_time
-    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
-
-    # Compute predictions
-    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
-    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
-
-    if args.version_2_with_negative:
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
-    else:
-        output_null_log_odds_file = None
-
-    if args.model_type in ["xlnet", "xlm"]:
-        # XLNet uses a more complex post-processing procedure
-        predictions = compute_predictions_log_probs(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            model.config.start_n_top,
-            model.config.end_n_top,
-            args.version_2_with_negative,
-            tokenizer,
-            args.verbose_logging,
-        )
-    else:
-        predictions = compute_predictions_logits(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            args.do_lower_case,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            args.verbose_logging,
-            args.version_2_with_negative,
-            args.null_score_diff_threshold,
-            tokenizer,
-        )
-
-    # Compute the F1 and exact scores.
-    results = squad_evaluate(examples, predictions)
-    return results
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-        torch.distributed.barrier()
-
-    # Load data features from cache or dataset file
-    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(
-        os.path.dirname(input_file),
-        "cached_distillation_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features_and_dataset = torch.load(cached_features_file)
-
-        try:
-            features, dataset, examples = (
-                features_and_dataset["features"],
-                features_and_dataset["dataset"],
-                features_and_dataset["examples"],
-            )
-        except KeyError:
-            raise DeprecationWarning(
-                "You seem to be loading features from an older version of this script please delete the "
-                "file %s in order for it to be created again" % cached_features_file
-            )
-    else:
-        logger.info("Creating features from dataset file at %s", input_file)
-        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
-        if evaluate:
-            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
-        else:
-            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
-
-        features, dataset = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-            return_dataset="pt",
-            threads=args.threads,
-        )
-
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-        torch.distributed.barrier()
-
-    if output_examples:
-        return dataset, examples, features
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model checkpoints and predictions will be written.",
-    )
-
-    # Distillation parameters (optional)
-    parser.add_argument(
-        "--teacher_type",
-        default=None,
-        type=str,
-        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
-    )
-    parser.add_argument(
-        "--teacher_name_or_path",
-        default=None,
-        type=str,
-        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.",
-    )
-    parser.add_argument(
-        "--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation."
-    )
-    parser.add_argument(
-        "--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation."
-    )
-    parser.add_argument(
-        "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation."
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        help="The input data dir. Should contain the .json files for the task."
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--train_file",
-        default=None,
-        type=str,
-        help="The input training file. If a data dir is specified, will look for the file there"
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--predict_file",
-        default=None,
-        type=str,
-        help="The input evaluation file. If a data dir is specified, will look for the file there"
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-
-    parser.add_argument(
-        "--version_2_with_negative",
-        action="store_true",
-        help="If true, the SQuAD examples contain some that do not have an answer.",
-    )
-    parser.add_argument(
-        "--null_score_diff_threshold",
-        type=float,
-        default=0.0,
-        help="If null_score - best_non_null is greater than the threshold predict null.",
-    )
-
-    parser.add_argument(
-        "--max_seq_length",
-        default=384,
-        type=int,
-        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-        "longer than this will be truncated, and sequences shorter than this will be padded.",
-    )
-    parser.add_argument(
-        "--doc_stride",
-        default=128,
-        type=int,
-        help="When splitting up a long document into chunks, how much stride to take between chunks.",
-    )
-    parser.add_argument(
-        "--max_query_length",
-        default=64,
-        type=int,
-        help="The maximum number of tokens for the question. Questions longer than this will "
-        "be truncated to this length.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-    parser.add_argument(
-        "--n_best_size",
-        default=20,
-        type=int,
-        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
-    )
-    parser.add_argument(
-        "--max_answer_length",
-        default=30,
-        type=int,
-        help="The maximum length of an answer that can be generated. This is needed because the start "
-        "and end predictions are not conditioned on one another.",
-    )
-    parser.add_argument(
-        "--verbose_logging",
-        action="store_true",
-        help="If true, all of the warnings related to data processing will be printed. "
-        "A number of warnings are expected for a normal SQuAD evaluation.",
-    )
-
-    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-
-    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.teacher_type is not None:
-        assert args.teacher_name_or_path is not None
-        assert args.alpha_ce > 0.0
-        assert args.alpha_ce + args.alpha_squad > 0.0
-        assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT."
-        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
-        teacher_config = teacher_config_class.from_pretrained(
-            args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None
-        )
-        teacher = teacher_model_class.from_pretrained(
-            args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None
-        )
-        teacher.to(args.device)
-    else:
-        teacher = None
-
-    if args.local_rank == 0:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
-    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
-    # remove the need for this code, but it is still valid.
-    if args.fp16:
-        try:
-            import apex
-
-            apex.amp.register_half_function(torch, "einsum")
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Save the trained model and the tokenizer
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model.to(args.device)
-
-    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        if args.do_train:
-            logger.info("Loading checkpoints saved during training for evaluation")
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
-
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        for checkpoint in checkpoints:
-            # Reload the model
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-
-            # Evaluate
-            result = evaluate(args, model, tokenizer, prefix=global_step)
-
-            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
-            results.update(result)
-
-    logger.info("Results: {}".format(results))
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/distillation/scripts/binarized_data.py b/server/transformers/examples/distillation/scripts/binarized_data.py
deleted file mode 100644
index 7590cfcbcf97956010fea877402f87d936717690..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/scripts/binarized_data.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocessing script before distillation.
-"""
-import argparse
-import logging
-import pickle
-import random
-import time
-
-import numpy as np
-
-from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
-)
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
-    )
-    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
-    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
-    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
-    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
-    args = parser.parse_args()
-
-    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
-    if args.tokenizer_type == "bert":
-        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
-        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
-    elif args.tokenizer_type == "roberta":
-        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
-        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
-    elif args.tokenizer_type == "gpt2":
-        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
-        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`
-
-    logger.info(f"Loading text from {args.file_path}")
-    with open(args.file_path, "r", encoding="utf8") as fp:
-        data = fp.readlines()
-
-    logger.info(f"Start encoding")
-    logger.info(f"{len(data)} examples to process.")
-
-    rslt = []
-    iter = 0
-    interval = 10000
-    start = time.time()
-    for text in data:
-        text = f"{bos} {text.strip()} {sep}"
-        token_ids = tokenizer.encode(text, add_special_tokens=False)
-        rslt.append(token_ids)
-
-        iter += 1
-        if iter % interval == 0:
-            end = time.time()
-            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
-            start = time.time()
-    logger.info("Finished binarization")
-    logger.info(f"{len(data)} examples processed.")
-
-    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
-    rslt_ = [np.uint16(d) for d in rslt]
-    random.shuffle(rslt_)
-    logger.info(f"Dump to {dp_file}")
-    with open(dp_file, "wb") as handle:
-        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/distillation/scripts/extract.py b/server/transformers/examples/distillation/scripts/extract.py
deleted file mode 100644
index 8d102c0cda8f23cafbfcd05a214791544d8aea99..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/scripts/extract.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocessing script before training the distilled model.
-Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
-"""
-import argparse
-
-import torch
-
-from transformers import GPT2LMHeadModel, RobertaForMaskedLM
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
-    )
-    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
-    parser.add_argument("--model_name", default="roberta-large", type=str)
-    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
-    parser.add_argument("--vocab_transform", action="store_true")
-    args = parser.parse_args()
-
-    if args.model_type == "roberta":
-        model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = "roberta"
-    elif args.model_type == "gpt2":
-        model = GPT2LMHeadModel.from_pretrained(args.model_name)
-        prefix = "transformer"
-
-    state_dict = model.state_dict()
-    compressed_sd = {}
-
-    # Embeddings #
-    if args.model_type == "gpt2":
-        for param_name in ["wte.weight", "wpe.weight"]:
-            compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
-    else:
-        for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
-            param_name = f"{prefix}.embeddings.{w}.weight"
-            compressed_sd[param_name] = state_dict[param_name]
-        for w in ["weight", "bias"]:
-            param_name = f"{prefix}.embeddings.LayerNorm.{w}"
-            compressed_sd[param_name] = state_dict[param_name]
-
-    # Transformer Blocks #
-    std_idx = 0
-    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        if args.model_type == "gpt2":
-            for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
-                for w in ["weight", "bias"]:
-                    compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
-                        f"{prefix}.h.{teacher_idx}.{layer}.{w}"
-                    ]
-            compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
-        else:
-            for layer in [
-                "attention.self.query",
-                "attention.self.key",
-                "attention.self.value",
-                "attention.output.dense",
-                "attention.output.LayerNorm",
-                "intermediate.dense",
-                "output.dense",
-                "output.LayerNorm",
-            ]:
-                for w in ["weight", "bias"]:
-                    compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
-                        f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
-                    ]
-        std_idx += 1
-
-    # Language Modeling Head ###s
-    if args.model_type == "roberta":
-        for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
-            compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
-        if args.vocab_transform:
-            for w in ["weight", "bias"]:
-                compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
-                compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
-    elif args.model_type == "gpt2":
-        for w in ["weight", "bias"]:
-            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
-        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
-
-    print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
-
-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
-    torch.save(compressed_sd, args.dump_checkpoint)
diff --git a/server/transformers/examples/distillation/scripts/extract_distilbert.py b/server/transformers/examples/distillation/scripts/extract_distilbert.py
deleted file mode 100644
index 972418b56b80bb1e7d2d8f71950bd3654079da31..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/scripts/extract_distilbert.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocessing script before training DistilBERT.
-Specific to BERT -> DistilBERT.
-"""
-import argparse
-
-import torch
-
-from transformers import BertForMaskedLM
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
-    )
-    parser.add_argument("--model_type", default="bert", choices=["bert"])
-    parser.add_argument("--model_name", default="bert-base-uncased", type=str)
-    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
-    parser.add_argument("--vocab_transform", action="store_true")
-    args = parser.parse_args()
-
-    if args.model_type == "bert":
-        model = BertForMaskedLM.from_pretrained(args.model_name)
-        prefix = "bert"
-    else:
-        raise ValueError(f'args.model_type should be "bert".')
-
-    state_dict = model.state_dict()
-    compressed_sd = {}
-
-    for w in ["word_embeddings", "position_embeddings"]:
-        compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
-    for w in ["weight", "bias"]:
-        compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]
-
-    std_idx = 0
-    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        for w in ["weight", "bias"]:
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
-            ]
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
-            ]
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
-            ]
-
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
-            ]
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
-            ]
-
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
-            ]
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
-            ]
-            compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
-                f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
-            ]
-        std_idx += 1
-
-    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
-    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
-    if args.vocab_transform:
-        for w in ["weight", "bias"]:
-            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
-            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
-
-    print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
-
-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
-    torch.save(compressed_sd, args.dump_checkpoint)
diff --git a/server/transformers/examples/distillation/scripts/token_counts.py b/server/transformers/examples/distillation/scripts/token_counts.py
deleted file mode 100644
index 0238bf66f865be5d32bff6783a8cb048563adc2b..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/scripts/token_counts.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocessing script before training the distilled model.
-"""
-import argparse
-import logging
-import pickle
-from collections import Counter
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
-)
-logger = logging.getLogger(__name__)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
-    )
-    parser.add_argument(
-        "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
-    )
-    parser.add_argument(
-        "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
-    )
-    parser.add_argument("--vocab_size", default=30522, type=int)
-    args = parser.parse_args()
-
-    logger.info(f"Loading data from {args.data_file}")
-    with open(args.data_file, "rb") as fp:
-        data = pickle.load(fp)
-
-    logger.info("Counting occurences for MLM.")
-    counter = Counter()
-    for tk_ids in data:
-        counter.update(tk_ids)
-    counts = [0] * args.vocab_size
-    for k, v in counter.items():
-        counts[k] = v
-
-    logger.info(f"Dump to {args.token_counts_dump}")
-    with open(args.token_counts_dump, "wb") as handle:
-        pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/server/transformers/examples/distillation/train.py b/server/transformers/examples/distillation/train.py
deleted file mode 100644
index 670d03ea16edf345e5f2a60b16988a8d3fffde6c..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/train.py
+++ /dev/null
@@ -1,322 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Training the distilled model.
-Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
-"""
-import argparse
-import json
-import os
-import pickle
-import shutil
-
-import numpy as np
-import torch
-
-from distiller import Distiller
-from lm_seqs_dataset import LmSeqsDataset
-from transformers import (
-    BertConfig,
-    BertForMaskedLM,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForMaskedLM,
-    DistilBertTokenizer,
-    GPT2Config,
-    GPT2LMHeadModel,
-    GPT2Tokenizer,
-    RobertaConfig,
-    RobertaForMaskedLM,
-    RobertaTokenizer,
-)
-from utils import git_log, init_gpu_params, logger, set_seed
-
-
-MODEL_CLASSES = {
-    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
-    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-}
-
-
-def sanity_checks(args):
-    """
-    A bunch of args sanity checks to perform even starting...
-    """
-    assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
-    assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
-    if args.mlm:
-        assert os.path.isfile(args.token_counts)
-        assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
-    else:
-        assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])
-
-    assert args.teacher_type == args.student_type or (
-        args.student_type == "distilbert" and args.teacher_type == "bert"
-    )
-    assert os.path.isfile(args.student_config)
-    if args.student_pretrained_weights is not None:
-        assert os.path.isfile(args.student_pretrained_weights)
-
-    if args.freeze_token_type_embds:
-        assert args.student_type in ["roberta"]
-
-    assert args.alpha_ce >= 0.0
-    assert args.alpha_mlm >= 0.0
-    assert args.alpha_clm >= 0.0
-    assert args.alpha_mse >= 0.0
-    assert args.alpha_cos >= 0.0
-    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0
-
-
-def freeze_pos_embeddings(student, args):
-    if args.student_type == "roberta":
-        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
-    elif args.student_type == "gpt2":
-        student.transformer.wpe.weight.requires_grad = False
-
-
-def freeze_token_type_embeddings(student, args):
-    if args.student_type == "roberta":
-        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Training")
-    parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")
-
-    parser.add_argument(
-        "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
-    )
-    parser.add_argument(
-        "--data_file",
-        type=str,
-        required=True,
-        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
-    )
-
-    parser.add_argument(
-        "--student_type",
-        type=str,
-        choices=["distilbert", "roberta", "gpt2"],
-        required=True,
-        help="The student type (DistilBERT, RoBERTa).",
-    )
-    parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
-    parser.add_argument(
-        "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
-    )
-
-    parser.add_argument(
-        "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
-    )
-    parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")
-
-    parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
-    parser.add_argument(
-        "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
-    )
-    parser.add_argument(
-        "--alpha_mlm",
-        default=0.0,
-        type=float,
-        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
-    )
-    parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
-    parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
-    parser.add_argument(
-        "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
-    )
-
-    parser.add_argument(
-        "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
-    )
-    parser.add_argument(
-        "--mlm_mask_prop",
-        default=0.15,
-        type=float,
-        help="Proportion of tokens for which we need to make a prediction.",
-    )
-    parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
-    parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
-    parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
-    parser.add_argument(
-        "--mlm_smoothing",
-        default=0.7,
-        type=float,
-        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
-    )
-    parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")
-
-    parser.add_argument(
-        "--restrict_ce_to_mask",
-        action="store_true",
-        help="If true, compute the distilation loss only the [MLM] prediction distribution.",
-    )
-    parser.add_argument(
-        "--freeze_pos_embs",
-        action="store_true",
-        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
-    )
-    parser.add_argument(
-        "--freeze_token_type_embds",
-        action="store_true",
-        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
-    )
-
-    parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
-    parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
-    parser.add_argument(
-        "--group_by_size",
-        action="store_false",
-        help="If true, group sequences that have similar length into the same batch. Default is true.",
-    )
-
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=50,
-        help="Gradient accumulation for larger training batches.",
-    )
-    parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
-    parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
-    parser.add_argument("--seed", type=int, default=56, help="Random seed")
-
-    parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
-    parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
-    args = parser.parse_args()
-    sanity_checks(args)
-
-    # ARGS #
-    init_gpu_params(args)
-    set_seed(args)
-    if args.is_master:
-        if os.path.exists(args.dump_path):
-            if not args.force:
-                raise ValueError(
-                    f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
-                    "Use `--force` if you want to overwrite it"
-                )
-            else:
-                shutil.rmtree(args.dump_path)
-
-        if not os.path.exists(args.dump_path):
-            os.makedirs(args.dump_path)
-        logger.info(f"Experiment will be dumped and logged in {args.dump_path}")
-
-        # SAVE PARAMS #
-        logger.info(f"Param: {args}")
-        with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
-            json.dump(vars(args), f, indent=4)
-        git_log(args.dump_path)
-
-    student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
-    teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
-
-    # TOKENIZER #
-    tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
-    special_tok_ids = {}
-    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
-        idx = tokenizer.all_special_tokens.index(tok_symbol)
-        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
-    logger.info(f"Special tokens {special_tok_ids}")
-    args.special_tok_ids = special_tok_ids
-    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
-
-    # DATA LOADER #
-    logger.info(f"Loading data from {args.data_file}")
-    with open(args.data_file, "rb") as fp:
-        data = pickle.load(fp)
-
-    if args.mlm:
-        logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
-        with open(args.token_counts, "rb") as fp:
-            counts = pickle.load(fp)
-
-        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
-        for idx in special_tok_ids.values():
-            token_probs[idx] = 0.0  # do not predict special tokens
-        token_probs = torch.from_numpy(token_probs)
-    else:
-        token_probs = None
-
-    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f"Data loader created.")
-
-    # STUDENT #
-    logger.info(f"Loading student config from {args.student_config}")
-    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
-    stu_architecture_config.output_hidden_states = True
-
-    if args.student_pretrained_weights is not None:
-        logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
-        student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
-    else:
-        student = student_model_class(stu_architecture_config)
-
-    if args.n_gpu > 0:
-        student.to(f"cuda:{args.local_rank}")
-    logger.info(f"Student loaded.")
-
-    # TEACHER #
-    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
-    if args.n_gpu > 0:
-        teacher.to(f"cuda:{args.local_rank}")
-    logger.info(f"Teacher loaded from {args.teacher_name}.")
-
-    # FREEZING #
-    if args.freeze_pos_embs:
-        freeze_pos_embeddings(student, args)
-    if args.freeze_token_type_embds:
-        freeze_token_type_embeddings(student, args)
-
-    # SANITY CHECKS #
-    assert student.config.vocab_size == teacher.config.vocab_size
-    assert student.config.hidden_size == teacher.config.hidden_size
-    assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
-    if args.mlm:
-        assert token_probs.size(0) == stu_architecture_config.vocab_size
-
-    # DISTILLER #
-    torch.cuda.empty_cache()
-    distiller = Distiller(
-        params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
-    )
-    distiller.train()
-    logger.info("Let's go get some drinks.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/distillation/training_configs/distilbert-base-multilingual-cased.json b/server/transformers/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
deleted file mode 100644
index f76e7febcba536f7ee6137e70ffca0acae649bea..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-	"activation": "gelu",
-	"attention_dropout": 0.1,
-	"dim": 768,
-	"dropout": 0.1,
-	"hidden_dim": 3072,
-	"initializer_range": 0.02,
-	"max_position_embeddings": 512,
-	"n_heads": 12,
-	"n_layers": 6,
-	"sinusoidal_pos_embds": true,
-	"tie_weights_": true,
-	"vocab_size": 119547
-  }
-  
\ No newline at end of file
diff --git a/server/transformers/examples/distillation/training_configs/distilbert-base-uncased.json b/server/transformers/examples/distillation/training_configs/distilbert-base-uncased.json
deleted file mode 100644
index 15d1e7fe00e63100b602a0d7db0cdbf16f7e6ff0..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/training_configs/distilbert-base-uncased.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-	"activation": "gelu",
-	"attention_dropout": 0.1,
-	"dim": 768,
-	"dropout": 0.1,
-	"hidden_dim": 3072,
-	"initializer_range": 0.02,
-	"max_position_embeddings": 512,
-	"n_heads": 12,
-	"n_layers": 6,
-	"sinusoidal_pos_embds": true,
-	"tie_weights_": true,
-	"vocab_size": 30522
-  }
-  
\ No newline at end of file
diff --git a/server/transformers/examples/distillation/training_configs/distilgpt2.json b/server/transformers/examples/distillation/training_configs/distilgpt2.json
deleted file mode 100644
index 8616e8e60fd522461462444f81f7259fe904f104..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/training_configs/distilgpt2.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-	"initializer_range": 0.02,
-	"layer_norm_epsilon": 0.00001,
-	"n_ctx": 1024,
-	"n_embd": 768,
-	"n_head": 12,
-	"n_layer": 6,
-	"n_positions": 1024,
-	"vocab_size": 50257
-}
\ No newline at end of file
diff --git a/server/transformers/examples/distillation/training_configs/distilroberta-base.json b/server/transformers/examples/distillation/training_configs/distilroberta-base.json
deleted file mode 100644
index 2d90ef6380a0e4d54dbab8b1a151f7162665c0da..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/training_configs/distilroberta-base.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-    "vocab_size": 50265,
-    "hidden_size": 768,
-    "num_hidden_layers": 6,
-    "num_attention_heads": 12,
-    "intermediate_size": 3072,
-    "hidden_act": "gelu",
-    "hidden_dropout_prob": 0.1,
-    "attention_probs_dropout_prob": 0.1,
-    "max_position_embeddings": 514,
-    "type_vocab_size": 1,
-    "initializer_range": 0.02,
-    "layer_norm_eps": 0.00001
-}
\ No newline at end of file
diff --git a/server/transformers/examples/distillation/utils.py b/server/transformers/examples/distillation/utils.py
deleted file mode 100644
index 211e7c61dacf1c252104cb9f67759ca5e29cf23c..0000000000000000000000000000000000000000
--- a/server/transformers/examples/distillation/utils.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Utils to train DistilBERT
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
-"""
-import json
-import logging
-import os
-import socket
-
-import git
-import numpy as np
-import torch
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO,
-)
-logger = logging.getLogger(__name__)
-
-
-def git_log(folder_path: str):
-    """
-    Log commit info.
-    """
-    repo = git.Repo(search_parent_directories=True)
-    repo_infos = {
-        "repo_id": str(repo),
-        "repo_sha": str(repo.head.object.hexsha),
-        "repo_branch": str(repo.active_branch),
-    }
-
-    with open(os.path.join(folder_path, "git_log.json"), "w") as f:
-        json.dump(repo_infos, f, indent=4)
-
-
-def init_gpu_params(params):
-    """
-    Handle single and multi-GPU / multi-node.
-    """
-    if params.n_gpu <= 0:
-        params.local_rank = 0
-        params.master_port = -1
-        params.is_master = True
-        params.multi_gpu = False
-        return
-
-    assert torch.cuda.is_available()
-
-    logger.info("Initializing GPUs")
-    if params.n_gpu > 1:
-        assert params.local_rank != -1
-
-        params.world_size = int(os.environ["WORLD_SIZE"])
-        params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
-        params.global_rank = int(os.environ["RANK"])
-
-        # number of nodes / node ID
-        params.n_nodes = params.world_size // params.n_gpu_per_node
-        params.node_id = params.global_rank // params.n_gpu_per_node
-        params.multi_gpu = True
-
-        assert params.n_nodes == int(os.environ["N_NODES"])
-        assert params.node_id == int(os.environ["NODE_RANK"])
-
-    # local job (single GPU)
-    else:
-        assert params.local_rank == -1
-
-        params.n_nodes = 1
-        params.node_id = 0
-        params.local_rank = 0
-        params.global_rank = 0
-        params.world_size = 1
-        params.n_gpu_per_node = 1
-        params.multi_gpu = False
-
-    # sanity checks
-    assert params.n_nodes >= 1
-    assert 0 <= params.node_id < params.n_nodes
-    assert 0 <= params.local_rank <= params.global_rank < params.world_size
-    assert params.world_size == params.n_nodes * params.n_gpu_per_node
-
-    # define whether this is the master process / if we are in multi-node distributed mode
-    params.is_master = params.node_id == 0 and params.local_rank == 0
-    params.multi_node = params.n_nodes > 1
-
-    # summary
-    PREFIX = f"--- Global rank: {params.global_rank} - "
-    logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes)
-    logger.info(PREFIX + "Node ID        : %i" % params.node_id)
-    logger.info(PREFIX + "Local rank     : %i" % params.local_rank)
-    logger.info(PREFIX + "World size     : %i" % params.world_size)
-    logger.info(PREFIX + "GPUs per node  : %i" % params.n_gpu_per_node)
-    logger.info(PREFIX + "Master         : %s" % str(params.is_master))
-    logger.info(PREFIX + "Multi-node     : %s" % str(params.multi_node))
-    logger.info(PREFIX + "Multi-GPU      : %s" % str(params.multi_gpu))
-    logger.info(PREFIX + "Hostname       : %s" % socket.gethostname())
-
-    # set GPU device
-    torch.cuda.set_device(params.local_rank)
-
-    # initialize multi-GPU
-    if params.multi_gpu:
-        logger.info("Initializing PyTorch distributed")
-        torch.distributed.init_process_group(
-            init_method="env://", backend="nccl",
-        )
-
-
-def set_seed(args):
-    """
-    Set the random seed.
-    """
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
diff --git a/server/transformers/examples/hans/hans_processors.py b/server/transformers/examples/hans/hans_processors.py
deleted file mode 100644
index ff75a0acd18c5da6d5da08ea20d603753bc0ff80..0000000000000000000000000000000000000000
--- a/server/transformers/examples/hans/hans_processors.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" GLUE processors and helpers """
-
-import logging
-import os
-
-from transformers.file_utils import is_tf_available
-from utils_hans import DataProcessor, InputExample, InputFeatures
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-logger = logging.getLogger(__name__)
-
-
-def hans_convert_examples_to_features(
-    examples,
-    tokenizer,
-    max_length=512,
-    task=None,
-    label_list=None,
-    output_mode=None,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """
-    Loads a data file into a list of ``InputFeatures``
-
-    Args:
-        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples
-        max_length: Maximum example length
-        task: HANS
-        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
-        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
-        pad_token: Padding token
-        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
-        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
-            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
-            actual values)
-
-    Returns:
-        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-        containing the task-specific features. If the input is a list of ``InputExamples``, will return
-        a list of task-specific ``InputFeatures`` which can be fed to the model.
-
-    """
-    is_tf_dataset = False
-    if is_tf_available() and isinstance(examples, tf.data.Dataset):
-        is_tf_dataset = True
-
-    if task is not None:
-        processor = glue_processors[task]()
-        if label_list is None:
-            label_list = processor.get_labels()
-            logger.info("Using label list %s for task %s" % (label_list, task))
-        if output_mode is None:
-            output_mode = glue_output_modes[task]
-            logger.info("Using output mode %s for task %s" % (output_mode, task))
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d" % (ex_index))
-        if is_tf_dataset:
-            example = processor.get_example_from_tensor_dict(example)
-            example = processor.tfds_map(example)
-
-        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
-        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-        else:
-            input_ids = input_ids + ([pad_token] * padding_length)
-            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
-            len(attention_mask), max_length
-        )
-        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
-            len(token_type_ids), max_length
-        )
-
-        if output_mode == "classification":
-            label = label_map[example.label] if example.label in label_map else 0
-        elif output_mode == "regression":
-            label = float(example.label)
-        else:
-            raise KeyError(output_mode)
-        pairID = str(example.pairID)
-
-        if ex_index < 10:
-            logger.info("*** Example ***")
-            logger.info("text_a: %s" % (example.text_a))
-            logger.info("text_b: %s" % (example.text_b))
-            logger.info("guid: %s" % (example.guid))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
-            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label))
-
-        features.append(
-            InputFeatures(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                token_type_ids=token_type_ids,
-                label=label,
-                pairID=pairID,
-            )
-        )
-
-    if is_tf_available() and is_tf_dataset:
-
-        def gen():
-            for ex in features:
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                    },
-                    ex.label,
-                )
-
-        return tf.data.Dataset.from_generator(
-            gen,
-            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-            (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                },
-                tf.TensorShape([]),
-            ),
-        )
-
-    return features
-
-
-class HansProcessor(DataProcessor):
-    """Processor for the HANS data set."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["premise"].numpy().decode("utf-8"),
-            tensor_dict["hypothesis"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[5]
-            text_b = line[6]
-            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
-        return examples
-
-
-glue_tasks_num_labels = {
-    "hans": 3,
-}
-
-glue_processors = {
-    "hans": HansProcessor,
-}
-
-glue_output_modes = {
-    "hans": "classification",
-}
diff --git a/server/transformers/examples/hans/test_hans.py b/server/transformers/examples/hans/test_hans.py
deleted file mode 100644
index 40c2a1bd3a1e015213bec1e0418ca9ac5d42ba3d..0000000000000000000000000000000000000000
--- a/server/transformers/examples/hans/test_hans.py
+++ /dev/null
@@ -1,643 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from hans_processors import glue_output_modes as output_modes
-from hans_processors import glue_processors as processors
-from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    AlbertConfig,
-    AlbertForSequenceClassification,
-    AlbertTokenizer,
-    BertConfig,
-    BertForSequenceClassification,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForSequenceClassification,
-    DistilBertTokenizer,
-    RobertaConfig,
-    RobertaForSequenceClassification,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMForSequenceClassification,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetForSequenceClassification,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-)
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    logs = {}
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            eval_key = "eval_{}".format(key)
-                            logs[eval_key] = value
-
-                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
-                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs["learning_rate"] = learning_rate_scalar
-                    logs["loss"] = loss_scalar
-                    logging_loss = tr_loss
-
-                    for key, value in logs.items():
-                        tb_writer.add_scalar(key, value, global_step)
-                    # print(json.dumps({**logs, **{'step': global_step}}))
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
-
-    results = {}
-    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-
-        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(eval_output_dir)
-
-        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        # multi-gpu eval
-        if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
-
-        # Eval!
-        logger.info("***** Running evaluation {} *****".format(prefix))
-        logger.info("  Num examples = %d", len(eval_dataset))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0.0
-        nb_eval_steps = 0
-        preds = None
-        out_label_ids = None
-        for batch in tqdm(eval_dataloader, desc="Evaluating"):
-            model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            with torch.no_grad():
-                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                if args.model_type != "distilbert":
-                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert", "xlnet"] else None
-                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
-                outputs = model(**inputs)
-                tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if preds is None:
-                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs["labels"].detach().cpu().numpy()
-                pair_ids = batch[4].detach().cpu().numpy()
-            else:
-                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-                pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        if args.output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif args.output_mode == "regression":
-            preds = np.squeeze(preds)
-
-        output_eval_file = os.path.join(eval_output_dir, "hans_predictions.txt")
-        with open(output_eval_file, "w") as writer:
-            writer.write("pairID,gld_label\n")
-            for pid, pred in zip(pair_ids, preds):
-                writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
-
-    return results
-
-
-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    processor = processors[task]()
-    output_mode = output_modes[task]
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-            str(task),
-        ),
-    )
-
-    label_list = processor.get_labels()
-
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
-            # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = (
-            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        )
-        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    if output_mode == "classification":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-    elif output_mode == "regression":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
-    return dataset, label_list
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--task_name",
-        default=None,
-        type=str,
-        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare GLUE task
-    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
-        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=args.task_name,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/hans/utils_hans.py b/server/transformers/examples/hans/utils_hans.py
deleted file mode 100644
index 8d0b42165caff48c66e85799c235a2d94647366e..0000000000000000000000000000000000000000
--- a/server/transformers/examples/hans/utils_hans.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import csv
-import json
-
-
-class InputExample(object):
-    """
-    A single training/test example for simple sequence classification.
-
-    Args:
-        guid: Unique id for the example.
-        text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-        text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
-        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-
-    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-        self.pairID = pairID
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-
-class InputFeatures(object):
-    """
-    A single set of features of data.
-
-    Args:
-        input_ids: Indices of input sequence tokens in the vocabulary.
-        attention_mask: Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
-    """
-
-    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-        self.pairID = pairID
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
-        """
-        raise NotImplementedError()
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8-sig") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                lines.append(line)
-            return lines
diff --git a/server/transformers/examples/mm-imdb/run_mmimdb.py b/server/transformers/examples/mm-imdb/run_mmimdb.py
deleted file mode 100644
index c7e9f7b47e0226cff61d0a01de7d4a2365021f70..0000000000000000000000000000000000000000
--- a/server/transformers/examples/mm-imdb/run_mmimdb.py
+++ /dev/null
@@ -1,614 +0,0 @@
-# coding=utf-8
-# Copyright (c) Facebook, Inc. and its affiliates.
-# Copyright (c) HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""
-
-
-import argparse
-import glob
-import json
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-import torch.nn as nn
-from sklearn.metrics import f1_score
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    AlbertConfig,
-    AlbertModel,
-    AlbertTokenizer,
-    BertConfig,
-    BertModel,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertModel,
-    DistilBertTokenizer,
-    MMBTConfig,
-    MMBTForClassification,
-    RobertaConfig,
-    RobertaModel,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMModel,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetModel,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertModel, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer, criterion):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset,
-        sampler=train_sampler,
-        batch_size=args.train_batch_size,
-        collate_fn=collate_fn,
-        num_workers=args.num_workers,
-    )
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    best_f1, n_no_improve = 0, 0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            labels = batch[5]
-            inputs = {
-                "input_ids": batch[0],
-                "input_modal": batch[2],
-                "attention_mask": batch[1],
-                "modal_start_tokens": batch[3],
-                "modal_end_tokens": batch[4],
-            }
-            outputs = model(**inputs)
-            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
-            loss = criterion(logits, labels)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    logs = {}
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, criterion)
-                        for key, value in results.items():
-                            eval_key = "eval_{}".format(key)
-                            logs[eval_key] = value
-
-                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
-                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs["learning_rate"] = learning_rate_scalar
-                    logs["loss"] = loss_scalar
-                    logging_loss = tr_loss
-
-                    for key, value in logs.items():
-                        tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{"step": global_step}}))
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-        if args.local_rank == -1:
-            results = evaluate(args, model, tokenizer, criterion)
-            if results["micro_f1"] > best_f1:
-                best_f1 = results["micro_f1"]
-                n_no_improve = 0
-            else:
-                n_no_improve += 1
-
-            if n_no_improve > args.patience:
-                train_iterator.close()
-                break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, criterion, prefix=""):
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_output_dir = args.output_dir
-    eval_dataset = load_examples(args, tokenizer, evaluate=True)
-
-    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(eval_output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(
-        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
-    )
-
-    # multi-gpu eval
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    preds = None
-    out_label_ids = None
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
-        batch = tuple(t.to(args.device) for t in batch)
-
-        with torch.no_grad():
-            batch = tuple(t.to(args.device) for t in batch)
-            labels = batch[5]
-            inputs = {
-                "input_ids": batch[0],
-                "input_modal": batch[2],
-                "attention_mask": batch[1],
-                "modal_start_tokens": batch[3],
-                "modal_end_tokens": batch[4],
-            }
-            outputs = model(**inputs)
-            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
-            tmp_eval_loss = criterion(logits, labels)
-            eval_loss += tmp_eval_loss.mean().item()
-        nb_eval_steps += 1
-        if preds is None:
-            preds = torch.sigmoid(logits).detach().cpu().numpy() > 0.5
-            out_label_ids = labels.detach().cpu().numpy()
-        else:
-            preds = np.append(preds, torch.sigmoid(logits).detach().cpu().numpy() > 0.5, axis=0)
-            out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
-
-    eval_loss = eval_loss / nb_eval_steps
-    result = {
-        "loss": eval_loss,
-        "macro_f1": f1_score(out_label_ids, preds, average="macro"),
-        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
-    }
-
-    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def load_examples(args, tokenizer, evaluate=False):
-    path = os.path.join(args.data_dir, "dev.jsonl" if evaluate else "train.jsonl")
-    transforms = get_image_transforms()
-    labels = get_mmimdb_labels()
-    dataset = JsonlDataset(path, tokenizer, transforms, labels, args.max_seq_length - args.num_image_embeds - 2)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument(
-        "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    # Setup model
-    labels = get_mmimdb_labels()
-    num_labels = len(labels)
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    transformer = model_class.from_pretrained(
-        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
-    )
-    img_encoder = ImageEncoder(args)
-    config = MMBTConfig(transformer_config, num_labels=num_labels)
-    model = MMBTForClassification(config, transformer, img_encoder)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_examples(args, tokenizer, evaluate=False)
-        label_frequences = train_dataset.get_label_frequencies()
-        label_frequences = [label_frequences[l] for l in labels]
-        label_weights = (
-            torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)
-        ) ** -1
-        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = MMBTForClassification(config, transformer, img_encoder)
-        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-            model = MMBTForClassification(config, transformer, img_encoder)
-            model.load_state_dict(torch.load(checkpoint))
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/mm-imdb/utils_mmimdb.py b/server/transformers/examples/mm-imdb/utils_mmimdb.py
deleted file mode 100644
index 5df0a886eca0ec0f98e8f1224e8772485df8650f..0000000000000000000000000000000000000000
--- a/server/transformers/examples/mm-imdb/utils_mmimdb.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright (c) Facebook, Inc. and its affiliates.
-# Copyright (c) HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from collections import Counter
-
-import torch
-import torch.nn as nn
-import torchvision
-import torchvision.transforms as transforms
-from PIL import Image
-from torch.utils.data import Dataset
-
-
-POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}
-
-
-class ImageEncoder(nn.Module):
-    def __init__(self, args):
-        super().__init__()
-        model = torchvision.models.resnet152(pretrained=True)
-        modules = list(model.children())[:-2]
-        self.model = nn.Sequential(*modules)
-        self.pool = nn.AdaptiveAvgPool2d(POOLING_BREAKDOWN[args.num_image_embeds])
-
-    def forward(self, x):
-        # Bx3x224x224 -> Bx2048x7x7 -> Bx2048xN -> BxNx2048
-        out = self.pool(self.model(x))
-        out = torch.flatten(out, start_dim=2)
-        out = out.transpose(1, 2).contiguous()
-        return out  # BxNx2048
-
-
-class JsonlDataset(Dataset):
-    def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
-        self.data = [json.loads(l) for l in open(data_path)]
-        self.data_dir = os.path.dirname(data_path)
-        self.tokenizer = tokenizer
-        self.labels = labels
-        self.n_classes = len(labels)
-        self.max_seq_length = max_seq_length
-
-        self.transforms = transforms
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, index):
-        sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
-        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
-        sentence = sentence[: self.max_seq_length]
-
-        label = torch.zeros(self.n_classes)
-        label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1
-
-        image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
-        image = self.transforms(image)
-
-        return {
-            "image_start_token": start_token,
-            "image_end_token": end_token,
-            "sentence": sentence,
-            "image": image,
-            "label": label,
-        }
-
-    def get_label_frequencies(self):
-        label_freqs = Counter()
-        for row in self.data:
-            label_freqs.update(row["label"])
-        return label_freqs
-
-
-def collate_fn(batch):
-    lens = [len(row["sentence"]) for row in batch]
-    bsz, max_seq_len = len(batch), max(lens)
-
-    mask_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
-    text_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
-
-    for i_batch, (input_row, length) in enumerate(zip(batch, lens)):
-        text_tensor[i_batch, :length] = input_row["sentence"]
-        mask_tensor[i_batch, :length] = 1
-
-    img_tensor = torch.stack([row["image"] for row in batch])
-    tgt_tensor = torch.stack([row["label"] for row in batch])
-    img_start_token = torch.stack([row["image_start_token"] for row in batch])
-    img_end_token = torch.stack([row["image_end_token"] for row in batch])
-
-    return text_tensor, mask_tensor, img_tensor, img_start_token, img_end_token, tgt_tensor
-
-
-def get_mmimdb_labels():
-    return [
-        "Crime",
-        "Drama",
-        "Thriller",
-        "Action",
-        "Comedy",
-        "Romance",
-        "Documentary",
-        "Short",
-        "Mystery",
-        "History",
-        "Family",
-        "Adventure",
-        "Fantasy",
-        "Sci-Fi",
-        "Western",
-        "Horror",
-        "Sport",
-        "War",
-        "Music",
-        "Musical",
-        "Animation",
-        "Biography",
-        "Film-Noir",
-    ]
-
-
-def get_image_transforms():
-    return transforms.Compose(
-        [
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
-        ]
-    )
diff --git a/server/transformers/examples/pplm/README.md b/server/transformers/examples/pplm/README.md
deleted file mode 100644
index ed105f95cf42a3f7b19624b1c478d9caba56c6ab..0000000000000000000000000000000000000000
--- a/server/transformers/examples/pplm/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
-
-Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
-
-This folder contains the original code used to run the Plug and Play Language Model (PPLM).
-
-Paper link: https://arxiv.org/abs/1912.02164
-
-Blog link: https://eng.uber.com/pplm
-
-Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
-
-
-## Setup
-
-```bash
-git clone https://github.com/huggingface/transformers && cd transformers
-pip install .
-pip install nltk torchtext # additional requirements.
-cd examples/pplm
-```
-
-## PPLM-BoW 
-
-### Example command for bag-of-words control
-
-```bash
-python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
-```
-
-### Tuning hyperparameters for bag-of-words control
-
-1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
-
-2. If the language being generated is repetitive (For e.g. "science science experiment experiment"), there are several options to consider: </br>
-	a) Reduce the `--stepsize` </br>
-	b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) </br>
-	c) Add `--grad-length xx` where xx is an (integer <= length, e.g. `--grad-length 30`).</br>
-
-
-## PPLM-Discrim
-
-### Example command for discriminator based sentiment control
-
-```bash
-python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
-```
-
-### Tuning hyperparameters for discriminator control
-
-1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
-
-2. Use `--class_label 3` for negative, and `--class_label 2` for positive
-
diff --git a/server/transformers/examples/pplm/imgs/headfigure.png b/server/transformers/examples/pplm/imgs/headfigure.png
deleted file mode 100644
index f4c11ad54d10b300e2051ef6ba2d209447bc92e4..0000000000000000000000000000000000000000
Binary files a/server/transformers/examples/pplm/imgs/headfigure.png and /dev/null differ
diff --git a/server/transformers/examples/pplm/imgs/wooly.png b/server/transformers/examples/pplm/imgs/wooly.png
deleted file mode 100644
index 190d3afd49f1795245772a5d8b81a50b821d17b4..0000000000000000000000000000000000000000
Binary files a/server/transformers/examples/pplm/imgs/wooly.png and /dev/null differ
diff --git a/server/transformers/examples/pplm/pplm_classification_head.py b/server/transformers/examples/pplm/pplm_classification_head.py
deleted file mode 100644
index e85ba608b225c5489aa26481fa04c0f626dabfce..0000000000000000000000000000000000000000
--- a/server/transformers/examples/pplm/pplm_classification_head.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch
-
-
-class ClassificationHead(torch.nn.Module):
-    """Classification Head for  transformer encoders"""
-
-    def __init__(self, class_size, embed_size):
-        super().__init__()
-        self.class_size = class_size
-        self.embed_size = embed_size
-        # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
-        # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
-        self.mlp = torch.nn.Linear(embed_size, class_size)
-
-    def forward(self, hidden_state):
-        # hidden_state = F.relu(self.mlp1(hidden_state))
-        # hidden_state = self.mlp2(hidden_state)
-        logits = self.mlp(hidden_state)
-        return logits
diff --git a/server/transformers/examples/pplm/run_pplm.py b/server/transformers/examples/pplm/run_pplm.py
deleted file mode 100644
index b334a0098cc913393c51ed06a16e8209422c4b81..0000000000000000000000000000000000000000
--- a/server/transformers/examples/pplm/run_pplm.py
+++ /dev/null
@@ -1,794 +0,0 @@
-#! /usr/bin/env python3
-# coding=utf-8
-
-# Copyright (c) 2019 Uber Technologies, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Example command with bag of words:
-python examples/run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
-
-Example command with discriminator:
-python examples/run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 30 --num_samples 10 --stepsize 0.01 --kl_scale 0.01 --gm_scale 0.95
-"""
-
-import argparse
-import json
-from operator import add
-from typing import List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-from torch.autograd import Variable
-from tqdm import trange
-
-from pplm_classification_head import ClassificationHead
-from transformers import GPT2Tokenizer
-from transformers.file_utils import cached_path
-from transformers.modeling_gpt2 import GPT2LMHeadModel
-
-
-PPLM_BOW = 1
-PPLM_DISCRIM = 2
-PPLM_BOW_DISCRIM = 3
-SMALL_CONST = 1e-15
-BIG_CONST = 1e10
-
-BAG_OF_WORDS_ARCHIVE_MAP = {
-    "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
-    "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
-    "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
-    "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
-    "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
-    "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
-    "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
-}
-
-DISCRIMINATOR_MODELS_PARAMS = {
-    "clickbait": {
-        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/clickbait_classifier_head.pt",
-        "class_size": 2,
-        "embed_size": 1024,
-        "class_vocab": {"non_clickbait": 0, "clickbait": 1},
-        "default_class": 1,
-        "pretrained_model": "gpt2-medium",
-    },
-    "sentiment": {
-        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
-        "class_size": 5,
-        "embed_size": 1024,
-        "class_vocab": {"very_positive": 2, "very_negative": 3},
-        "default_class": 3,
-        "pretrained_model": "gpt2-medium",
-    },
-}
-
-
-def to_var(x, requires_grad=False, volatile=False, device="cuda"):
-    if torch.cuda.is_available() and device == "cuda":
-        x = x.cuda()
-    elif device != "cuda":
-        x = x.to(device)
-    return Variable(x, requires_grad=requires_grad, volatile=volatile)
-
-
-def top_k_filter(logits, k, probs=False):
-    """
-    Masks everything but the k top entries as -infinity (1e10).
-    Used to mask logits such that e^-infinity -> 0 won't contribute to the
-    sum of the denominator.
-    """
-    if k == 0:
-        return logits
-    else:
-        values = torch.topk(logits, k)[0]
-        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
-        if probs:
-            return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits)
-        return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits)
-
-
-def perturb_past(
-    past,
-    model,
-    last,
-    unpert_past=None,
-    unpert_logits=None,
-    accumulated_hidden=None,
-    grad_norms=None,
-    stepsize=0.01,
-    one_hot_bows_vectors=None,
-    classifier=None,
-    class_label=None,
-    loss_type=0,
-    num_iterations=3,
-    horizon_length=1,
-    window_length=0,
-    decay=False,
-    gamma=1.5,
-    kl_scale=0.01,
-    device="cuda",
-):
-    # Generate inital perturbed past
-    grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past]
-
-    if accumulated_hidden is None:
-        accumulated_hidden = 0
-
-    if decay:
-        decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:]
-    else:
-        decay_mask = 1.0
-
-    # TODO fix this comment (SUMANTH)
-    # Generate a mask is gradient perturbated is based on a past window
-    _, _, _, curr_length, _ = past[0].shape
-
-    if curr_length > window_length and window_length > 0:
-        ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:])
-
-        zeros_key_val_shape = (
-            tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:])
-        )
-
-        ones_mask = torch.ones(ones_key_val_shape)
-        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
-        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)
-
-        window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device)
-    else:
-        window_mask = torch.ones_like(past[0]).to(device)
-
-    # accumulate perturbations for num_iterations
-    loss_per_iter = []
-    new_accumulated_hidden = None
-    for i in range(num_iterations):
-        print("Iteration ", i + 1)
-        curr_perturbation = [
-            to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator
-        ]
-
-        # Compute hidden using perturbed past
-        perturbed_past = list(map(add, past, curr_perturbation))
-        _, _, _, curr_length, _ = curr_perturbation[0].shape
-        all_logits, _, all_hidden = model(last, past=perturbed_past)
-        hidden = all_hidden[-1]
-        new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
-        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
-        logits = all_logits[:, -1, :]
-        probs = F.softmax(logits, dim=-1)
-
-        loss = 0.0
-        loss_list = []
-        if loss_type == PPLM_BOW or loss_type == PPLM_BOW_DISCRIM:
-            for one_hot_bow in one_hot_bows_vectors:
-                bow_logits = torch.mm(probs, torch.t(one_hot_bow))
-                bow_loss = -torch.log(torch.sum(bow_logits))
-                loss += bow_loss
-                loss_list.append(bow_loss)
-            print(" pplm_bow_loss:", loss.data.cpu().numpy())
-
-        if loss_type == 2 or loss_type == 3:
-            ce_loss = torch.nn.CrossEntropyLoss()
-            # TODO why we need to do this assignment and not just using unpert_past? (Sumanth)
-            curr_unpert_past = unpert_past
-            curr_probs = torch.unsqueeze(probs, dim=1)
-            wte = model.resize_token_embeddings()
-            for _ in range(horizon_length):
-                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
-                _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds)
-                curr_hidden = curr_all_hidden[-1]
-                new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1)
-
-            prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length))
-
-            label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long)
-            discrim_loss = ce_loss(prediction, label)
-            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
-            loss += discrim_loss
-            loss_list.append(discrim_loss)
-
-        kl_loss = 0.0
-        if kl_scale > 0.0:
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
-            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
-            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
-            corrected_probs = probs + correction.detach()
-            kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum())
-            print(" kl_loss", kl_loss.data.cpu().numpy())
-            loss += kl_loss
-
-        loss_per_iter.append(loss.data.cpu().numpy())
-        print(" pplm_loss", (loss - kl_loss).data.cpu().numpy())
-
-        # compute gradients
-        loss.backward()
-
-        # calculate gradient norms
-        if grad_norms is not None and loss_type == PPLM_BOW:
-            grad_norms = [
-                torch.max(grad_norms[index], torch.norm(p_.grad * window_mask))
-                for index, p_ in enumerate(curr_perturbation)
-            ]
-        else:
-            grad_norms = [
-                (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation)
-            ]
-
-        # normalize gradients
-        grad = [
-            -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
-            for index, p_ in enumerate(curr_perturbation)
-        ]
-
-        # accumulate gradient
-        grad_accumulator = list(map(add, grad, grad_accumulator))
-
-        # reset gradients, just to make sure
-        for p_ in curr_perturbation:
-            p_.grad.data.zero_()
-
-        # removing past from the graph
-        new_past = []
-        for p_ in past:
-            new_past.append(p_.detach())
-        past = new_past
-
-    # apply the accumulated perturbations to the past
-    grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator]
-    pert_past = list(map(add, past, grad_accumulator))
-
-    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
-
-
-def get_classifier(
-    name: Optional[str], class_label: Union[str, int], device: str
-) -> Tuple[Optional[ClassificationHead], Optional[int]]:
-    if name is None:
-        return None, None
-
-    params = DISCRIMINATOR_MODELS_PARAMS[name]
-    classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device)
-    if "url" in params:
-        resolved_archive_file = cached_path(params["url"])
-    elif "path" in params:
-        resolved_archive_file = params["path"]
-    else:
-        raise ValueError("Either url or path have to be specified " "in the discriminator model parameters")
-    classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device))
-    classifier.eval()
-
-    if isinstance(class_label, str):
-        if class_label in params["class_vocab"]:
-            label_id = params["class_vocab"][class_label]
-        else:
-            label_id = params["default_class"]
-            print("class_label {} not in class_vocab".format(class_label))
-            print("available values are: {}".format(params["class_vocab"]))
-            print("using default class {}".format(label_id))
-
-    elif isinstance(class_label, int):
-        if class_label in set(params["class_vocab"].values()):
-            label_id = class_label
-        else:
-            label_id = params["default_class"]
-            print("class_label {} not in class_vocab".format(class_label))
-            print("available values are: {}".format(params["class_vocab"]))
-            print("using default class {}".format(label_id))
-
-    else:
-        label_id = params["default_class"]
-
-    return classifier, label_id
-
-
-def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
-    bow_indices = []
-    for id_or_path in bag_of_words_ids_or_paths:
-        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
-            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
-        else:
-            filepath = id_or_path
-        with open(filepath, "r") as f:
-            words = f.read().strip().split("\n")
-        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
-    return bow_indices
-
-
-def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"):
-    if bow_indices is None:
-        return None
-
-    one_hot_bows_vectors = []
-    for single_bow in bow_indices:
-        single_bow = list(filter(lambda x: len(x) <= 1, single_bow))
-        single_bow = torch.tensor(single_bow).to(device)
-        num_words = single_bow.shape[0]
-        one_hot_bow = torch.zeros(num_words, tokenizer.vocab_size).to(device)
-        one_hot_bow.scatter_(1, single_bow, 1)
-        one_hot_bows_vectors.append(one_hot_bow)
-    return one_hot_bows_vectors
-
-
-def full_text_generation(
-    model,
-    tokenizer,
-    context=None,
-    num_samples=1,
-    device="cuda",
-    bag_of_words=None,
-    discrim=None,
-    class_label=None,
-    length=100,
-    stepsize=0.02,
-    temperature=1.0,
-    top_k=10,
-    sample=False,
-    num_iterations=3,
-    grad_length=10000,
-    horizon_length=1,
-    window_length=0,
-    decay=False,
-    gamma=1.5,
-    gm_scale=0.9,
-    kl_scale=0.01,
-    repetition_penalty=1.0,
-    **kwargs
-):
-    classifier, class_id = get_classifier(discrim, class_label, device)
-
-    bow_indices = []
-    if bag_of_words:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
-
-    if bag_of_words and classifier:
-        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
-        loss_type = PPLM_BOW_DISCRIM
-
-    elif bag_of_words:
-        loss_type = PPLM_BOW
-        print("Using PPLM-BoW")
-
-    elif classifier is not None:
-        loss_type = PPLM_DISCRIM
-        print("Using PPLM-Discrim")
-
-    else:
-        raise Exception("Specify either a bag of words or a discriminator")
-
-    unpert_gen_tok_text, _, _ = generate_text_pplm(
-        model=model,
-        tokenizer=tokenizer,
-        context=context,
-        device=device,
-        length=length,
-        sample=sample,
-        perturb=False,
-        repetition_penalty=repetition_penalty,
-    )
-    if device == "cuda":
-        torch.cuda.empty_cache()
-
-    pert_gen_tok_texts = []
-    discrim_losses = []
-    losses_in_time = []
-
-    for i in range(num_samples):
-        pert_gen_tok_text, discrim_loss, loss_in_time = generate_text_pplm(
-            model=model,
-            tokenizer=tokenizer,
-            context=context,
-            device=device,
-            perturb=True,
-            bow_indices=bow_indices,
-            classifier=classifier,
-            class_label=class_id,
-            loss_type=loss_type,
-            length=length,
-            stepsize=stepsize,
-            temperature=temperature,
-            top_k=top_k,
-            sample=sample,
-            num_iterations=num_iterations,
-            grad_length=grad_length,
-            horizon_length=horizon_length,
-            window_length=window_length,
-            decay=decay,
-            gamma=gamma,
-            gm_scale=gm_scale,
-            kl_scale=kl_scale,
-            repetition_penalty=repetition_penalty,
-        )
-        pert_gen_tok_texts.append(pert_gen_tok_text)
-        if classifier is not None:
-            discrim_losses.append(discrim_loss.data.cpu().numpy())
-        losses_in_time.append(loss_in_time)
-
-    if device == "cuda":
-        torch.cuda.empty_cache()
-
-    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
-
-
-def generate_text_pplm(
-    model,
-    tokenizer,
-    context=None,
-    past=None,
-    device="cuda",
-    perturb=True,
-    bow_indices=None,
-    classifier=None,
-    class_label=None,
-    loss_type=0,
-    length=100,
-    stepsize=0.02,
-    temperature=1.0,
-    top_k=10,
-    sample=False,
-    num_iterations=3,
-    grad_length=10000,
-    horizon_length=1,
-    window_length=0,
-    decay=False,
-    gamma=1.5,
-    gm_scale=0.9,
-    kl_scale=0.01,
-    repetition_penalty=1.0,
-):
-    output_so_far = None
-    if context:
-        context_t = torch.tensor(context, device=device, dtype=torch.long)
-        while len(context_t.shape) < 2:
-            context_t = context_t.unsqueeze(0)
-        output_so_far = context_t
-
-    # collect one hot vectors for bags of words
-    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device)
-
-    grad_norms = None
-    last = None
-    unpert_discrim_loss = 0
-    loss_in_time = []
-    for i in trange(length, ascii=True):
-
-        # Get past/probs for current output, except for last word
-        # Note that GPT takes 2 inputs: past + current_token
-
-        # run model forward to obtain unperturbed
-        if past is None and output_so_far is not None:
-            last = output_so_far[:, -1:]
-            if output_so_far.shape[1] > 1:
-                _, past, _ = model(output_so_far[:, :-1])
-
-        unpert_logits, unpert_past, unpert_all_hidden = model(output_so_far)
-        unpert_last_hidden = unpert_all_hidden[-1]
-
-        # check if we are abowe grad max length
-        if i >= grad_length:
-            current_stepsize = stepsize * 0
-        else:
-            current_stepsize = stepsize
-
-        # modify the past if necessary
-        if not perturb or num_iterations == 0:
-            pert_past = past
-
-        else:
-            accumulated_hidden = unpert_last_hidden[:, :-1, :]
-            accumulated_hidden = torch.sum(accumulated_hidden, dim=1)
-
-            if past is not None:
-                pert_past, _, grad_norms, loss_this_iter = perturb_past(
-                    past,
-                    model,
-                    last,
-                    unpert_past=unpert_past,
-                    unpert_logits=unpert_logits,
-                    accumulated_hidden=accumulated_hidden,
-                    grad_norms=grad_norms,
-                    stepsize=current_stepsize,
-                    one_hot_bows_vectors=one_hot_bows_vectors,
-                    classifier=classifier,
-                    class_label=class_label,
-                    loss_type=loss_type,
-                    num_iterations=num_iterations,
-                    horizon_length=horizon_length,
-                    window_length=window_length,
-                    decay=decay,
-                    gamma=gamma,
-                    kl_scale=kl_scale,
-                    device=device,
-                )
-                loss_in_time.append(loss_this_iter)
-            else:
-                pert_past = past
-
-        pert_logits, past, pert_all_hidden = model(last, past=pert_past)
-        pert_logits = pert_logits[:, -1, :] / temperature  # + SMALL_CONST
-
-        for token_idx in set(output_so_far[0].tolist()):
-            if pert_logits[0, token_idx] < 0:
-                pert_logits[0, token_idx] *= repetition_penalty
-            else:
-                pert_logits[0, token_idx] /= repetition_penalty
-
-        pert_probs = F.softmax(pert_logits, dim=-1)
-
-        if classifier is not None:
-            ce_loss = torch.nn.CrossEntropyLoss()
-            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
-            label = torch.tensor([class_label], device=device, dtype=torch.long)
-            unpert_discrim_loss = ce_loss(prediction, label)
-            print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy())
-        else:
-            unpert_discrim_loss = 0
-
-        # Fuse the modified model and original model
-        if perturb:
-
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
-
-            pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
-            pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
-
-            # rescale
-            if torch.sum(pert_probs) <= 1:
-                pert_probs = pert_probs / torch.sum(pert_probs)
-
-        else:
-            pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
-            pert_probs = F.softmax(pert_logits, dim=-1)
-
-        # sample or greedy
-        if sample:
-            last = torch.multinomial(pert_probs, num_samples=1)
-
-        else:
-            _, last = torch.topk(pert_probs, k=1, dim=-1)
-
-        # update context/output_so_far appending the new token
-        output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1)
-
-        print(tokenizer.decode(output_so_far.tolist()[0]))
-
-    return output_so_far, unpert_discrim_loss, loss_in_time
-
-
-def set_generic_model_params(discrim_weights, discrim_meta):
-    if discrim_weights is None:
-        raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified")
-    if discrim_meta is None:
-        raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified")
-
-    with open(discrim_meta, "r") as discrim_meta_file:
-        meta = json.load(discrim_meta_file)
-    meta["path"] = discrim_weights
-    DISCRIMINATOR_MODELS_PARAMS["generic"] = meta
-
-
-def run_pplm_example(
-    pretrained_model="gpt2-medium",
-    cond_text="",
-    uncond=False,
-    num_samples=1,
-    bag_of_words=None,
-    discrim=None,
-    discrim_weights=None,
-    discrim_meta=None,
-    class_label=-1,
-    length=100,
-    stepsize=0.02,
-    temperature=1.0,
-    top_k=10,
-    sample=False,
-    num_iterations=3,
-    grad_length=10000,
-    horizon_length=1,
-    window_length=0,
-    decay=False,
-    gamma=1.5,
-    gm_scale=0.9,
-    kl_scale=0.01,
-    seed=0,
-    no_cuda=False,
-    colorama=False,
-    repetition_penalty=1.0,
-):
-    # set Random seed
-    torch.manual_seed(seed)
-    np.random.seed(seed)
-
-    # set the device
-    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
-
-    if discrim == "generic":
-        set_generic_model_params(discrim_weights, discrim_meta)
-
-    if discrim is not None:
-        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
-        print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model))
-
-    # load pretrained model
-    model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True)
-    model.to(device)
-    model.eval()
-
-    # load tokenizer
-    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
-
-    # Freeze GPT-2 weights
-    for param in model.parameters():
-        param.requires_grad = False
-
-    # figure out conditioning text
-    if uncond:
-        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
-    else:
-        raw_text = cond_text
-        while not raw_text:
-            print("Did you forget to add `--cond_text`? ")
-            raw_text = input("Model prompt >>> ")
-        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)
-
-    print("= Prefix of sentence =")
-    print(tokenizer.decode(tokenized_cond_text))
-    print()
-
-    # generate unperturbed and perturbed texts
-
-    # full_text_generation returns:
-    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
-    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
-        model=model,
-        tokenizer=tokenizer,
-        context=tokenized_cond_text,
-        device=device,
-        num_samples=num_samples,
-        bag_of_words=bag_of_words,
-        discrim=discrim,
-        class_label=class_label,
-        length=length,
-        stepsize=stepsize,
-        temperature=temperature,
-        top_k=top_k,
-        sample=sample,
-        num_iterations=num_iterations,
-        grad_length=grad_length,
-        horizon_length=horizon_length,
-        window_length=window_length,
-        decay=decay,
-        gamma=gamma,
-        gm_scale=gm_scale,
-        kl_scale=kl_scale,
-        repetition_penalty=repetition_penalty,
-    )
-
-    # untokenize unperturbed text
-    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])
-
-    print("=" * 80)
-    print("= Unperturbed generated text =")
-    print(unpert_gen_text)
-    print()
-
-    generated_texts = []
-
-    bow_word_ids = set()
-    if bag_of_words and colorama:
-        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
-        for single_bow_list in bow_indices:
-            # filtering all words in the list composed of more than 1 token
-            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
-            # w[0] because we are sure w has only 1 item because previous fitler
-            bow_word_ids.update(w[0] for w in filtered)
-
-    # iterate through the perturbed texts
-    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
-        try:
-            # untokenize unperturbed text
-            if colorama:
-                import colorama
-
-                pert_gen_text = ""
-                for word_id in pert_gen_tok_text.tolist()[0]:
-                    if word_id in bow_word_ids:
-                        pert_gen_text += "{}{}{}".format(
-                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL
-                        )
-                    else:
-                        pert_gen_text += tokenizer.decode([word_id])
-            else:
-                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
-
-            print("= Perturbed generated text {} =".format(i + 1))
-            print(pert_gen_text)
-            print()
-        except Exception as exc:
-            print("Ignoring error while generating perturbed text:", exc)
-
-        # keep the prefix, perturbed seq, original seq for each index
-        generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))
-
-    return
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pretrained_model",
-        "-M",
-        type=str,
-        default="gpt2-medium",
-        help="pretrained model name or path to local checkpoint",
-    )
-    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
-    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
-    parser.add_argument(
-        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
-    )
-    parser.add_argument(
-        "--bag_of_words",
-        "-B",
-        type=str,
-        default=None,
-        help="Bags of words used for PPLM-BoW. "
-        "Either a BOW id (see list in code) or a filepath. "
-        "Multiple BoWs separated by ;",
-    )
-    parser.add_argument(
-        "--discrim",
-        "-D",
-        type=str,
-        default=None,
-        choices=("clickbait", "sentiment", "toxicity", "generic"),
-        help="Discriminator to use",
-    )
-    parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator")
-    parser.add_argument(
-        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator"
-    )
-    parser.add_argument(
-        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
-    )
-    parser.add_argument("--length", type=int, default=100)
-    parser.add_argument("--stepsize", type=float, default=0.02)
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--top_k", type=int, default=10)
-    parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix")
-    parser.add_argument("--num_iterations", type=int, default=3)
-    parser.add_argument("--grad_length", type=int, default=10000)
-    parser.add_argument(
-        "--window_length",
-        type=int,
-        default=0,
-        help="Length of past which is being optimized; " "0 corresponds to infinite window length",
-    )
-    parser.add_argument(
-        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
-    )
-    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
-    parser.add_argument("--gamma", type=float, default=1.5)
-    parser.add_argument("--gm_scale", type=float, default=0.9)
-    parser.add_argument("--kl_scale", type=float, default=0.01)
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
-    parser.add_argument("--colorama", action="store_true", help="colors keywords")
-    parser.add_argument(
-        "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
-    )
-
-    args = parser.parse_args()
-    run_pplm_example(**vars(args))
diff --git a/server/transformers/examples/pplm/run_pplm_discrim_train.py b/server/transformers/examples/pplm/run_pplm_discrim_train.py
deleted file mode 100644
index ce6f583dc6d8bfe3c0d4612ce76adbeaaf7572e4..0000000000000000000000000000000000000000
--- a/server/transformers/examples/pplm/run_pplm_discrim_train.py
+++ /dev/null
@@ -1,517 +0,0 @@
-#! /usr/bin/env python3
-# coding=utf-8
-
-# Copyright (c) 2019 Uber Technologies, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import csv
-import json
-import math
-import time
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.optim as optim
-import torch.utils.data as data
-from nltk.tokenize.treebank import TreebankWordDetokenizer
-from torchtext import data as torchtext_data
-from torchtext import datasets
-from tqdm import tqdm, trange
-
-from pplm_classification_head import ClassificationHead
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-
-
-torch.manual_seed(0)
-np.random.seed(0)
-EPSILON = 1e-10
-example_sentence = "This is incredible! I love it, this is the best chicken I have ever had."
-max_length_seq = 100
-
-
-class Discriminator(torch.nn.Module):
-    """Transformer encoder followed by a Classification Head"""
-
-    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
-        super().__init__()
-        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
-        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
-        self.embed_size = self.encoder.transformer.config.hidden_size
-        self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
-        self.cached_mode = cached_mode
-        self.device = device
-
-    def get_classifier(self):
-        return self.classifier_head
-
-    def train_custom(self):
-        for param in self.encoder.parameters():
-            param.requires_grad = False
-        self.classifier_head.train()
-
-    def avg_representation(self, x):
-        mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach()
-        hidden, _ = self.encoder.transformer(x)
-        masked_hidden = hidden * mask
-        avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON)
-        return avg_hidden
-
-    def forward(self, x):
-        if self.cached_mode:
-            avg_hidden = x.to(self.device)
-        else:
-            avg_hidden = self.avg_representation(x.to(self.device))
-
-        logits = self.classifier_head(avg_hidden)
-        probs = F.log_softmax(logits, dim=-1)
-
-        return probs
-
-
-class Dataset(data.Dataset):
-    def __init__(self, X, y):
-        """Reads source and target sequences from txt files."""
-        self.X = X
-        self.y = y
-
-    def __len__(self):
-        return len(self.X)
-
-    def __getitem__(self, index):
-        """Returns one data pair (source and target)."""
-        data = {}
-        data["X"] = self.X[index]
-        data["y"] = self.y[index]
-        return data
-
-
-def collate_fn(data):
-    def pad_sequences(sequences):
-        lengths = [len(seq) for seq in sequences]
-
-        padded_sequences = torch.zeros(len(sequences), max(lengths)).long()  # padding value = 0
-
-        for i, seq in enumerate(sequences):
-            end = lengths[i]
-            padded_sequences[i, :end] = seq[:end]
-
-        return padded_sequences, lengths
-
-    item_info = {}
-    for key in data[0].keys():
-        item_info[key] = [d[key] for d in data]
-
-    x_batch, _ = pad_sequences(item_info["X"])
-    y_batch = torch.tensor(item_info["y"], dtype=torch.long)
-
-    return x_batch, y_batch
-
-
-def cached_collate_fn(data):
-    item_info = {}
-    for key in data[0].keys():
-        item_info[key] = [d[key] for d in data]
-
-    x_batch = torch.cat(item_info["X"], 0)
-    y_batch = torch.tensor(item_info["y"], dtype=torch.long)
-
-    return x_batch, y_batch
-
-
-def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"):
-    samples_so_far = 0
-    discriminator.train_custom()
-    for batch_idx, (input_t, target_t) in enumerate(data_loader):
-        input_t, target_t = input_t.to(device), target_t.to(device)
-
-        optimizer.zero_grad()
-
-        output_t = discriminator(input_t)
-        loss = F.nll_loss(output_t, target_t)
-        loss.backward(retain_graph=True)
-        optimizer.step()
-
-        samples_so_far += len(input_t)
-
-        if batch_idx % log_interval == 0:
-            print(
-                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
-                    epoch + 1,
-                    samples_so_far,
-                    len(data_loader.dataset),
-                    100 * samples_so_far / len(data_loader.dataset),
-                    loss.item(),
-                )
-            )
-
-
-def evaluate_performance(data_loader, discriminator, device="cpu"):
-    discriminator.eval()
-    test_loss = 0
-    correct = 0
-    with torch.no_grad():
-        for input_t, target_t in data_loader:
-            input_t, target_t = input_t.to(device), target_t.to(device)
-            output_t = discriminator(input_t)
-            # sum up batch loss
-            test_loss += F.nll_loss(output_t, target_t, reduction="sum").item()
-            # get the index of the max log-probability
-            pred_t = output_t.argmax(dim=1, keepdim=True)
-            correct += pred_t.eq(target_t.view_as(pred_t)).sum().item()
-
-    test_loss /= len(data_loader.dataset)
-
-    print(
-        "Performance on test set: "
-        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
-            test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset)
-        )
-    )
-
-
-def predict(input_sentence, model, classes, cached=False, device="cpu"):
-    input_t = model.tokenizer.encode(input_sentence)
-    input_t = torch.tensor([input_t], dtype=torch.long, device=device)
-    if cached:
-        input_t = model.avg_representation(input_t)
-
-    log_probs = model(input_t).data.cpu().numpy().flatten().tolist()
-    print("Input sentence:", input_sentence)
-    print(
-        "Predictions:",
-        ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)),
-    )
-
-
-def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"):
-    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn)
-
-    xs = []
-    ys = []
-    for batch_idx, (x, y) in enumerate(tqdm(data_loader, ascii=True)):
-        with torch.no_grad():
-            x = x.to(device)
-            avg_rep = discriminator.avg_representation(x).cpu().detach()
-            avg_rep_list = torch.unbind(avg_rep.unsqueeze(1))
-            xs += avg_rep_list
-            ys += y.cpu().numpy().tolist()
-
-    data_loader = torch.utils.data.DataLoader(
-        dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn
-    )
-
-    return data_loader
-
-
-def train_discriminator(
-    dataset,
-    dataset_fp=None,
-    pretrained_model="gpt2-medium",
-    epochs=10,
-    batch_size=64,
-    log_interval=10,
-    save_model=False,
-    cached=False,
-    no_cuda=False,
-):
-    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
-
-    print("Preprocessing {} dataset...".format(dataset))
-    start = time.time()
-
-    if dataset == "SST":
-        idx2class = ["positive", "negative", "very positive", "very negative", "neutral"]
-        class2idx = {c: i for i, c in enumerate(idx2class)}
-
-        discriminator = Discriminator(
-            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
-        ).to(device)
-
-        text = torchtext_data.Field()
-        label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
-
-        x = []
-        y = []
-        for i in trange(len(train_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"])
-            seq = discriminator.tokenizer.encode(seq)
-            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
-            x.append(seq)
-            y.append(class2idx[vars(train_data[i])["label"]])
-        train_dataset = Dataset(x, y)
-
-        test_x = []
-        test_y = []
-        for i in trange(len(test_data), ascii=True):
-            seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"])
-            seq = discriminator.tokenizer.encode(seq)
-            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
-            test_x.append(seq)
-            test_y.append(class2idx[vars(test_data[i])["label"]])
-        test_dataset = Dataset(test_x, test_y)
-
-        discriminator_meta = {
-            "class_size": len(idx2class),
-            "embed_size": discriminator.embed_size,
-            "pretrained_model": pretrained_model,
-            "class_vocab": class2idx,
-            "default_class": 2,
-        }
-
-    elif dataset == "clickbait":
-        idx2class = ["non_clickbait", "clickbait"]
-        class2idx = {c: i for i, c in enumerate(idx2class)}
-
-        discriminator = Discriminator(
-            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
-        ).to(device)
-
-        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
-            data = []
-            for i, line in enumerate(f):
-                try:
-                    data.append(eval(line))
-                except Exception:
-                    print("Error evaluating line {}: {}".format(i, line))
-                    continue
-        x = []
-        y = []
-        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
-            for i, line in enumerate(tqdm(f, ascii=True)):
-                try:
-                    d = eval(line)
-                    seq = discriminator.tokenizer.encode(d["text"])
-
-                    if len(seq) < max_length_seq:
-                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
-                    else:
-                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
-                        continue
-                    x.append(seq)
-                    y.append(d["label"])
-                except Exception:
-                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
-                    pass
-
-        full_dataset = Dataset(x, y)
-        train_size = int(0.9 * len(full_dataset))
-        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
-
-        discriminator_meta = {
-            "class_size": len(idx2class),
-            "embed_size": discriminator.embed_size,
-            "pretrained_model": pretrained_model,
-            "class_vocab": class2idx,
-            "default_class": 1,
-        }
-
-    elif dataset == "toxic":
-        idx2class = ["non_toxic", "toxic"]
-        class2idx = {c: i for i, c in enumerate(idx2class)}
-
-        discriminator = Discriminator(
-            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
-        ).to(device)
-
-        x = []
-        y = []
-        with open("datasets/toxic/toxic_train.txt") as f:
-            for i, line in enumerate(tqdm(f, ascii=True)):
-                try:
-                    d = eval(line)
-                    seq = discriminator.tokenizer.encode(d["text"])
-
-                    if len(seq) < max_length_seq:
-                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
-                    else:
-                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
-                        continue
-                    x.append(seq)
-                    y.append(int(np.sum(d["label"]) > 0))
-                except Exception:
-                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
-                    pass
-
-        full_dataset = Dataset(x, y)
-        train_size = int(0.9 * len(full_dataset))
-        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
-
-        discriminator_meta = {
-            "class_size": len(idx2class),
-            "embed_size": discriminator.embed_size,
-            "pretrained_model": pretrained_model,
-            "class_vocab": class2idx,
-            "default_class": 0,
-        }
-
-    else:  # if dataset == "generic":
-        # This assumes the input dataset is a TSV with the following structure:
-        # class \t text
-
-        if dataset_fp is None:
-            raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.")
-
-        classes = set()
-        with open(dataset_fp) as f:
-            csv_reader = csv.reader(f, delimiter="\t")
-            for row in tqdm(csv_reader, ascii=True):
-                if row:
-                    classes.add(row[0])
-
-        idx2class = sorted(classes)
-        class2idx = {c: i for i, c in enumerate(idx2class)}
-
-        discriminator = Discriminator(
-            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
-        ).to(device)
-
-        x = []
-        y = []
-        with open(dataset_fp) as f:
-            csv_reader = csv.reader(f, delimiter="\t")
-            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
-                if row:
-                    label = row[0]
-                    text = row[1]
-
-                    try:
-                        seq = discriminator.tokenizer.encode(text)
-                        if len(seq) < max_length_seq:
-                            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
-
-                        else:
-                            print("Line {} is longer than maximum length {}".format(i, max_length_seq))
-                            continue
-
-                        x.append(seq)
-                        y.append(class2idx[label])
-
-                    except Exception:
-                        print("Error tokenizing line {}, skipping it".format(i))
-                        pass
-
-        full_dataset = Dataset(x, y)
-        train_size = int(0.9 * len(full_dataset))
-        test_size = len(full_dataset) - train_size
-        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
-
-        discriminator_meta = {
-            "class_size": len(idx2class),
-            "embed_size": discriminator.embed_size,
-            "pretrained_model": pretrained_model,
-            "class_vocab": class2idx,
-            "default_class": 0,
-        }
-
-    end = time.time()
-    print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset)))
-    print("Data preprocessing took: {:.3f}s".format(end - start))
-
-    if cached:
-        print("Building representation cache...")
-
-        start = time.time()
-
-        train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device)
-
-        test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device)
-
-        end = time.time()
-        print("Building representation cache took: {:.3f}s".format(end - start))
-
-    else:
-        train_loader = torch.utils.data.DataLoader(
-            dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
-        )
-        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn)
-
-    if save_model:
-        with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file:
-            json.dump(discriminator_meta, meta_file)
-
-    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
-
-    for epoch in range(epochs):
-        start = time.time()
-        print("\nEpoch", epoch + 1)
-
-        train_epoch(
-            discriminator=discriminator,
-            data_loader=train_loader,
-            optimizer=optimizer,
-            epoch=epoch,
-            log_interval=log_interval,
-            device=device,
-        )
-        evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device)
-
-        end = time.time()
-        print("Epoch took: {:.3f}s".format(end - start))
-
-        print("\nExample prediction")
-        predict(example_sentence, discriminator, idx2class, cached=cached, device=device)
-
-        if save_model:
-            # torch.save(discriminator.state_dict(),
-            #           "{}_discriminator_{}.pt".format(
-            #               args.dataset, epoch + 1
-            #               ))
-            torch.save(
-                discriminator.get_classifier().state_dict(),
-                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
-            )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations")
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default="SST",
-        choices=("SST", "clickbait", "toxic", "generic"),
-        help="dataset to train the discriminator on."
-        "In case of generic, the dataset is expected"
-        "to be a TSBV file with structure: class \\t text",
-    )
-    parser.add_argument(
-        "--dataset_fp",
-        type=str,
-        default="",
-        help="File path of the dataset to use. " "Needed only in case of generic datadset",
-    )
-    parser.add_argument(
-        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
-    )
-    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
-    parser.add_argument(
-        "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
-    )
-    parser.add_argument(
-        "--log_interval",
-        type=int,
-        default=10,
-        metavar="N",
-        help="how many batches to wait before logging training status",
-    )
-    parser.add_argument("--save_model", action="store_true", help="whether to save the model")
-    parser.add_argument("--cached", action="store_true", help="whether to cache the input representations")
-    parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda")
-    args = parser.parse_args()
-
-    train_discriminator(**(vars(args)))
diff --git a/server/transformers/examples/requirements.txt b/server/transformers/examples/requirements.txt
deleted file mode 100644
index 36229755e81885681fd14a80eff8325cbc6053f5..0000000000000000000000000000000000000000
--- a/server/transformers/examples/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-tensorboardX
-tensorboard
-scikit-learn
-seqeval
diff --git a/server/transformers/examples/run_bertology.py b/server/transformers/examples/run_bertology.py
deleted file mode 100644
index acac56128a05f6a8c05149234e474dc35ef348df..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_bertology.py
+++ /dev/null
@@ -1,426 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2018 CMU and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Bertology: this script shows how you can explore the internals of the models in the library to:
-    - compute the entropy of the head attentions
-    - compute the importance of each head
-    - prune (remove) the low importance head.
-    Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
-    which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
-"""
-import argparse
-import logging
-import os
-from datetime import datetime
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, SequentialSampler, Subset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm
-
-from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
-
-
-logger = logging.getLogger(__name__)
-
-
-def entropy(p):
-    """ Compute the entropy of a probability distribution """
-    plogp = p * torch.log(p)
-    plogp[p == 0] = 0
-    return -plogp.sum(dim=-1)
-
-
-def print_2d_tensor(tensor):
-    """ Print a 2D tensor """
-    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
-    for row in range(len(tensor)):
-        if tensor.dtype != torch.long:
-            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
-        else:
-            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
-
-
-def compute_heads_importance(
-    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
-):
-    """ This method shows how to compute:
-        - head attention entropy
-        - head importance scores according to http://arxiv.org/abs/1905.10650
-    """
-    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
-    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
-    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
-
-    if head_mask is None:
-        head_mask = torch.ones(n_layers, n_heads).to(args.device)
-    head_mask.requires_grad_(requires_grad=True)
-    preds = None
-    labels = None
-    tot_tokens = 0.0
-
-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
-
-        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(
-            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
-        )
-        loss, logits, all_attentions = (
-            outputs[0],
-            outputs[1],
-            outputs[-1],
-        )  # Loss and logits are the first, attention the last
-        loss.backward()  # Backpropagate to populate the gradients in the head mask
-
-        if compute_entropy:
-            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
-                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
-
-        if compute_importance:
-            head_importance += head_mask.grad.abs().detach()
-
-        # Also store our logits/labels if we want to compute metrics afterwards
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
-
-        tot_tokens += input_mask.float().detach().sum().data
-
-    # Normalize
-    attn_entropy /= tot_tokens
-    head_importance /= tot_tokens
-    # Layerwise importance normalization
-    if not args.dont_normalize_importance_by_layer:
-        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
-        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
-
-    if not args.dont_normalize_global_importance:
-        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
-
-    # Print/save matrices
-    np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())
-
-    logger.info("Attention entropies")
-    print_2d_tensor(attn_entropy)
-    logger.info("Head importance scores")
-    print_2d_tensor(head_importance)
-    logger.info("Head ranked by importance scores")
-    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
-        head_importance.numel(), device=args.device
-    )
-    head_ranks = head_ranks.view_as(head_importance)
-    print_2d_tensor(head_ranks)
-
-    return attn_entropy, head_importance, preds, labels
-
-
-def mask_heads(args, model, eval_dataloader):
-    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
-        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
-    """
-    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
-    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
-
-    new_head_mask = torch.ones_like(head_importance)
-    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
-
-    current_score = original_score
-    while current_score >= original_score * args.masking_threshold:
-        head_mask = new_head_mask.clone()  # save current head mask
-        # heads from least important to most - keep only not-masked heads
-        head_importance[head_mask == 0.0] = float("Inf")
-        current_heads_to_mask = head_importance.view(-1).sort()[1]
-
-        if len(current_heads_to_mask) <= num_to_mask:
-            break
-
-        # mask heads
-        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
-        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
-        new_head_mask = new_head_mask.view(-1)
-        new_head_mask[current_heads_to_mask] = 0.0
-        new_head_mask = new_head_mask.view_as(head_mask)
-        print_2d_tensor(new_head_mask)
-
-        # Compute metric and head importance again
-        _, head_importance, preds, labels = compute_heads_importance(
-            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
-        )
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-        logger.info(
-            "Masking: current score: %f, remaning heads %d (%.1f percents)",
-            current_score,
-            new_head_mask.sum(),
-            new_head_mask.sum() / new_head_mask.numel() * 100,
-        )
-
-    logger.info("Final head mask")
-    print_2d_tensor(head_mask)
-    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
-
-    return head_mask
-
-
-def prune_heads(args, model, eval_dataloader, head_mask):
-    """ This method shows how to prune head (remove heads weights) based on
-        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
-    """
-    # Try pruning and test time speedup
-    # Pruning is like masking but we actually remove the masked weights
-    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
-    )
-    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-    original_time = datetime.now() - before_time
-
-    original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
-    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
-    model.prune_heads(heads_to_prune)
-    pruned_num_params = sum(p.numel() for p in model.parameters())
-
-    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
-    )
-    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-    new_time = datetime.now() - before_time
-
-    logger.info(
-        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
-        original_num_params,
-        pruned_num_params,
-        pruned_num_params / original_num_params * 100,
-    )
-    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--task_name",
-        default=None,
-        type=str,
-        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name",
-        default="",
-        type=str,
-        help="Pretrained config name or path if not the same as model_name_or_path",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
-    )
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-
-    parser.add_argument(
-        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
-    )
-    parser.add_argument(
-        "--dont_normalize_global_importance",
-        action="store_true",
-        help="Don't normalize all importance scores between 0 and 1",
-    )
-
-    parser.add_argument(
-        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
-    )
-    parser.add_argument(
-        "--masking_threshold",
-        default=0.9,
-        type=float,
-        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
-    )
-    parser.add_argument(
-        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
-    )
-    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
-
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after WordPiece tokenization. \n"
-        "Sequences longer than this will be truncated, sequences shorter padded.",
-    )
-    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
-
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup devices and distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        args.device = torch.device("cuda", args.local_rank)
-        args.n_gpu = 1
-        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend
-
-    # Setup logging
-    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
-
-    # Set seeds
-    set_seed(args)
-
-    # Prepare GLUE task
-    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
-        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name_or_path.lower():
-            args.model_type = key  # take the first match in model types
-            break
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=args.task_name,
-        output_attentions=True,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    # Distributed and parallel training
-    model.to(args.device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-    elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Print/save training arguments
-    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Prepare dataset for the GLUE task
-    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
-    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    # Compute head entropy and importance score
-    compute_heads_importance(args, model, eval_dataloader)
-
-    # Try head masking (set heads to zero until the score goes under a threshole)
-    # and head pruning (remove masked heads and see the effect on the network)
-    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
-        head_mask = mask_heads(args, model, eval_dataloader)
-        prune_heads(args, model, eval_dataloader, head_mask)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_generation.py b/server/transformers/examples/run_generation.py
deleted file mode 100644
index d074c9e2642c753bec2766f9317fd511c6a4e3a4..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_generation.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#!/usr/bin/env python3
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
-"""
-
-
-import argparse
-import logging
-
-import numpy as np
-import torch
-
-from transformers import (
-    CTRLLMHeadModel,
-    CTRLTokenizer,
-    GPT2LMHeadModel,
-    GPT2Tokenizer,
-    OpenAIGPTLMHeadModel,
-    OpenAIGPTTokenizer,
-    TransfoXLLMHeadModel,
-    TransfoXLTokenizer,
-    XLMTokenizer,
-    XLMWithLMHeadModel,
-    XLNetLMHeadModel,
-    XLNetTokenizer,
-)
-
-
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
-)
-logger = logging.getLogger(__name__)
-
-MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
-
-MODEL_CLASSES = {
-    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
-    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
-    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
-    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
-    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
-}
-
-# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
-# in https://github.com/rusiaaman/XLNet-gen#methodology
-# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
-PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
-(except for Alexei and Maria) are discovered.
-The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-remainder of the story. 1883 Western Siberia,
-a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-Rasputin has a vision and denounces one of the men as a horse thief. Although his
-father initially slaps him for making such an accusation, Rasputin watches as the
-man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
-
-
-def set_seed(args):
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-#
-# Functions to prepare models' input
-#
-
-
-def prepare_ctrl_input(args, _, tokenizer, prompt_text):
-    if args.temperature > 0.7:
-        logger.info("CTRL typically works better with lower temperatures (and lower top_k).")
-
-    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
-    if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
-        logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
-    return prompt_text
-
-
-def prepare_xlm_input(args, model, tokenizer, prompt_text):
-    # kwargs = {"language": None, "mask_token_id": None}
-
-    # Set the language
-    use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb
-    if hasattr(model.config, "lang2id") and use_lang_emb:
-        available_languages = model.config.lang2id.keys()
-        if args.xlm_language in available_languages:
-            language = args.xlm_language
-        else:
-            language = None
-            while language not in available_languages:
-                language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
-        # kwargs["language"] = tokenizer.lang2id[language]
-
-    # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
-    # XLM masked-language modeling (MLM) models need masked token
-    # is_xlm_mlm = "mlm" in args.model_name_or_path
-    # if is_xlm_mlm:
-    #     kwargs["mask_token_id"] = tokenizer.mask_token_id
-
-    return prompt_text
-
-
-def prepare_xlnet_input(args, _, tokenizer, prompt_text):
-    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
-    return prompt_text, {}
-
-
-def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
-    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
-    return prompt_text, {}
-
-
-PREPROCESSING_FUNCTIONS = {
-    "ctrl": prepare_ctrl_input,
-    "xlm": prepare_xlm_input,
-    "xlnet": prepare_xlnet_input,
-    "transfo-xl": prepare_transfoxl_input,
-}
-
-
-def adjust_length_to_model(length, max_sequence_length):
-    if length < 0 and max_sequence_length > 0:
-        length = max_sequence_length
-    elif 0 < max_sequence_length < length:
-        length = max_sequence_length  # No generation bigger than model size
-    elif length < 0:
-        length = MAX_LENGTH  # avoid infinite loop
-    return length
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-
-    parser.add_argument("--prompt", type=str, default="")
-    parser.add_argument("--length", type=int, default=20)
-    parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
-
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=1.0,
-        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
-    )
-    parser.add_argument(
-        "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
-    )
-    parser.add_argument("--k", type=int, default=0)
-    parser.add_argument("--p", type=float, default=0.9)
-
-    parser.add_argument("--padding_text", type=str, default="", help="Padding text for Transfo-XL and XLNet.")
-    parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
-
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    args = parser.parse_args()
-
-    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-    args.n_gpu = torch.cuda.device_count()
-
-    set_seed(args)
-
-    # Initialize the model and tokenizer
-    try:
-        args.model_type = args.model_type.lower()
-        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    except KeyError:
-        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")
-
-    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
-    model = model_class.from_pretrained(args.model_name_or_path)
-    model.to(args.device)
-
-    args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)
-    logger.info(args)
-
-    prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
-
-    # Different models need different input formatting and/or extra arguments
-    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
-    if requires_preprocessing:
-        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
-        prompt_text = prepare_input(args, model, tokenizer, prompt_text)
-    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
-    encoded_prompt = encoded_prompt.to(args.device)
-
-    output_sequences = model.generate(
-        input_ids=encoded_prompt,
-        max_length=args.length,
-        temperature=args.temperature,
-        top_k=args.k,
-        top_p=args.p,
-        repetition_penalty=args.repetition_penalty,
-        do_sample=True,
-    )
-
-    # Batch size == 1. to add more examples please use num_return_sequences > 1
-    generated_sequence = output_sequences[0].tolist()
-    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
-    text = text[: text.find(args.stop_token) if args.stop_token else None]
-
-    print(text)
-
-    return text
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_glue.py b/server/transformers/examples/run_glue.py
deleted file mode 100644
index dc8f66434bb8377050ea02396c0bcbe8e96fb1ff..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_glue.py
+++ /dev/null
@@ -1,698 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
-
-
-import argparse
-import glob
-import json
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    AlbertConfig,
-    AlbertForSequenceClassification,
-    AlbertTokenizer,
-    BertConfig,
-    BertForSequenceClassification,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForSequenceClassification,
-    DistilBertTokenizer,
-    FlaubertConfig,
-    FlaubertForSequenceClassification,
-    FlaubertTokenizer,
-    RobertaConfig,
-    RobertaForSequenceClassification,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMForSequenceClassification,
-    XLMRobertaConfig,
-    XLMRobertaForSequenceClassification,
-    XLMRobertaTokenizer,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetForSequenceClassification,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_convert_examples_to_features as convert_examples_to_features
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (
-            BertConfig,
-            XLNetConfig,
-            XLMConfig,
-            RobertaConfig,
-            DistilBertConfig,
-            AlbertConfig,
-            XLMRobertaConfig,
-            FlaubertConfig,
-        )
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
-    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
-    "flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
-        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-        logger.info("  Continuing training from epoch %d", epochs_trained)
-        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
-    )
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
-                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    logs = {}
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            eval_key = "eval_{}".format(key)
-                            logs[eval_key] = value
-
-                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
-                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs["learning_rate"] = learning_rate_scalar
-                    logs["loss"] = loss_scalar
-                    logging_loss = tr_loss
-
-                    for key, value in logs.items():
-                        tb_writer.add_scalar(key, value, global_step)
-                    print(json.dumps({**logs, **{"step": global_step}}))
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
-
-    results = {}
-    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-
-        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(eval_output_dir)
-
-        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        # multi-gpu eval
-        if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
-
-        # Eval!
-        logger.info("***** Running evaluation {} *****".format(prefix))
-        logger.info("  Num examples = %d", len(eval_dataset))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0.0
-        nb_eval_steps = 0
-        preds = None
-        out_label_ids = None
-        for batch in tqdm(eval_dataloader, desc="Evaluating"):
-            model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            with torch.no_grad():
-                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                if args.model_type != "distilbert":
-                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
-                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
-                outputs = model(**inputs)
-                tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if preds is None:
-                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs["labels"].detach().cpu().numpy()
-            else:
-                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        if args.output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif args.output_mode == "regression":
-            preds = np.squeeze(preds)
-        result = compute_metrics(eval_task, preds, out_label_ids)
-        results.update(result)
-
-        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results {} *****".format(prefix))
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return results
-
-
-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    processor = processors[task]()
-    output_mode = output_modes[task]
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-            str(task),
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = processor.get_labels()
-        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
-            # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = (
-            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        )
-        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    if output_mode == "classification":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-    elif output_mode == "regression":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
-    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--task_name",
-        default=None,
-        type=str,
-        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
-    )
-
-    parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
-    )
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare GLUE task
-    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
-        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=args.task_name,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_lm_finetuning.py b/server/transformers/examples/run_lm_finetuning.py
deleted file mode 100644
index 663881649d815772a0e4ff02367992fba3883425..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_lm_finetuning.py
+++ /dev/null
@@ -1,790 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
-GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
-using a masked language modeling (MLM) loss.
-"""
-
-
-import argparse
-import glob
-import logging
-import os
-import pickle
-import random
-import re
-import shutil
-from typing import Dict, List, Tuple
-
-import numpy as np
-import torch
-from torch.nn.utils.rnn import pad_sequence
-from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMaskedLM,
-    BertTokenizer,
-    CamembertConfig,
-    CamembertForMaskedLM,
-    CamembertTokenizer,
-    DistilBertConfig,
-    DistilBertForMaskedLM,
-    DistilBertTokenizer,
-    GPT2Config,
-    GPT2LMHeadModel,
-    GPT2Tokenizer,
-    OpenAIGPTConfig,
-    OpenAIGPTLMHeadModel,
-    OpenAIGPTTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    RobertaConfig,
-    RobertaForMaskedLM,
-    RobertaTokenizer,
-    get_linear_schedule_with_warmup,
-)
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-
-MODEL_CLASSES = {
-    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
-    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
-}
-
-
-class TextDataset(Dataset):
-    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
-        assert os.path.isfile(file_path)
-        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(
-            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
-        )
-
-        if os.path.exists(cached_features_file) and not args.overwrite_cache:
-            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, "rb") as handle:
-                self.examples = pickle.load(handle)
-        else:
-            logger.info("Creating features from dataset file at %s", directory)
-
-            self.examples = []
-            with open(file_path, encoding="utf-8") as f:
-                text = f.read()
-
-            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-
-            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
-                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
-            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
-            # If your dataset is small, first you should loook for a bigger one :-) and second you
-            # can change this behavior by adding (model specific) padding.
-
-            logger.info("Saving features into cached file %s", cached_features_file)
-            with open(cached_features_file, "wb") as handle:
-                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, item):
-        return torch.tensor(self.examples[item])
-
-
-class LineByLineTextDataset(Dataset):
-    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
-        assert os.path.isfile(file_path)
-        # Here, we do not cache the features, operating under the assumption
-        # that we will soon use fast multithreaded tokenizers from the
-        # `tokenizers` repo everywhere =)
-        logger.info("Creating features from dataset file at %s", file_path)
-
-        with open(file_path, encoding="utf-8") as f:
-            lines = [line for line in f.read().splitlines() if len(line) > 0]
-
-        self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, i):
-        return torch.tensor(self.examples[i])
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    file_path = args.eval_data_file if evaluate else args.train_data_file
-    if args.line_by_line:
-        return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
-    else:
-        return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
-    ordering_and_checkpoint_path = []
-
-    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
-
-    for path in glob_checkpoints:
-        if use_mtime:
-            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
-        else:
-            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
-            if regex_match and regex_match.groups():
-                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
-
-    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
-    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
-    return checkpoints_sorted
-
-
-def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
-    if not args.save_total_limit:
-        return
-    if args.save_total_limit <= 0:
-        return
-
-    # Check if we should delete older checkpoint(s)
-    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
-    if len(checkpoints_sorted) <= args.save_total_limit:
-        return
-
-    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
-    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
-    for checkpoint in checkpoints_to_be_deleted:
-        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
-        shutil.rmtree(checkpoint)
-
-
-def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
-    labels = inputs.clone()
-    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    special_tokens_mask = [
-        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
-    ]
-    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
-    if tokenizer._pad_token is not None:
-        padding_mask = labels.eq(tokenizer.pad_token_id)
-        probability_matrix.masked_fill_(padding_mask, value=0.0)
-    masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -100  # We only compute loss on masked tokens
-
-    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
-    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
-
-    # 10% of the time, we replace masked input tokens with random word
-    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
-    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
-    inputs[indices_random] = random_words[indices_random]
-
-    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
-    return inputs, labels
-
-
-def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-
-    def collate(examples: List[torch.Tensor]):
-        if tokenizer._pad_token is None:
-            return pad_sequence(examples, batch_first=True)
-        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
-
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
-    )
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if (
-        args.model_name_or_path
-        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
-        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
-        try:
-            # set global_step to gobal_step of last saved checkpoint from model path
-            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
-            global_step = int(checkpoint_suffix)
-            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-            logger.info("  Continuing training from epoch %d", epochs_trained)
-            logger.info("  Continuing training from global step %d", global_step)
-            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-        except ValueError:
-            logger.info("  Starting fine-tuning.")
-
-    tr_loss, logging_loss = 0.0, 0.0
-
-    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
-    model_to_resize.resize_token_embeddings(len(tokenizer))
-
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    set_seed(args)  # Added here for reproducibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
-            inputs = inputs.to(args.device)
-            labels = labels.to(args.device)
-            model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    checkpoint_prefix = "checkpoint"
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
-                    os.makedirs(output_dir, exist_ok=True)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    _rotate_checkpoints(args, checkpoint_prefix)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_output_dir = args.output_dir
-
-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
-
-    if args.local_rank in [-1, 0]:
-        os.makedirs(eval_output_dir, exist_ok=True)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-
-    def collate(examples: List[torch.Tensor]):
-        if tokenizer._pad_token is None:
-            return pad_sequence(examples, batch_first=True)
-        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
-
-    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(
-        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
-    )
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    model.eval()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
-        inputs = inputs.to(args.device)
-        labels = labels.to(args.device)
-
-        with torch.no_grad():
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            lm_loss = outputs[0]
-            eval_loss += lm_loss.mean().item()
-        nb_eval_steps += 1
-
-    eval_loss = eval_loss / nb_eval_steps
-    perplexity = torch.exp(torch.tensor(eval_loss))
-
-    result = {"perplexity": perplexity}
-
-    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument(
-        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--eval_data_file",
-        default=None,
-        type=str,
-        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
-    )
-    parser.add_argument(
-        "--line_by_line",
-        action="store_true",
-        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
-    )
-    parser.add_argument(
-        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
-    )
-
-    parser.add_argument(
-        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
-    )
-    parser.add_argument(
-        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
-    )
-
-    parser.add_argument(
-        "--config_name",
-        default=None,
-        type=str,
-        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default=None,
-        type=str,
-        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default=None,
-        type=str,
-        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
-    )
-    parser.add_argument(
-        "--block_size",
-        default=-1,
-        type=int,
-        help="Optional input sequence length after tokenization."
-        "The training dataset will be truncated in block of this size for training."
-        "Default to the model max input length for single sentence inputs (take into account special tokens).",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--save_total_limit",
-        type=int,
-        default=None,
-        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
-    )
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
-        raise ValueError(
-            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
-            "flag (masked language modeling)."
-        )
-    if args.eval_data_file is None and args.do_eval:
-        raise ValueError(
-            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
-            "or remove the --do_eval argument."
-        )
-    if args.should_continue:
-        sorted_checkpoints = _sorted_checkpoints(args)
-        if len(sorted_checkpoints) == 0:
-            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
-        else:
-            args.model_name_or_path = sorted_checkpoints[-1]
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
-
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-
-    if args.config_name:
-        config = config_class.from_pretrained(args.config_name, cache_dir=args.cache_dir)
-    elif args.model_name_or_path:
-        config = config_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
-    else:
-        config = config_class()
-
-    if args.tokenizer_name:
-        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
-    elif args.model_name_or_path:
-        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
-    else:
-        raise ValueError(
-            "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
-            "and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
-        )
-
-    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence
-        # Our input block size will be the max possible for the model
-    else:
-        args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
-
-    if args.model_name_or_path:
-        model = model_class.from_pretrained(
-            args.model_name_or_path,
-            from_tf=bool(".ckpt" in args.model_name_or_path),
-            config=config,
-            cache_dir=args.cache_dir,
-        )
-    else:
-        logger.info("Training new model from scratch")
-        model = model_class(config=config)
-
-    model.to(args.device)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        if args.local_rank not in [-1, 0]:
-            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
-
-        if args.local_rank == 0:
-            torch.distributed.barrier()
-
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_multiple_choice.py b/server/transformers/examples/run_multiple_choice.py
deleted file mode 100644
index 72337c110fcb9fed295af13d4bd26906a9a55100..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_multiple_choice.py
+++ /dev/null
@@ -1,678 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    RobertaConfig,
-    RobertaForMultipleChoice,
-    RobertaTokenizer,
-    XLNetConfig,
-    XLNetForMultipleChoice,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from utils_multiple_choice import convert_examples_to_features, processors
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
-    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
-}
-
-
-def select_field(features, field):
-    return [[choice[field] for choice in feature.choices_features] for feature in features]
-
-
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    best_dev_acc = 0.0
-    best_steps = 0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                "token_type_ids": batch[2]
-                if args.model_type in ["bert", "xlnet"]
-                else None,  # XLM don't use segment_ids
-                "labels": batch[3],
-            }
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-            else:
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                        if results["eval_acc"] > best_dev_acc:
-                            best_dev_acc = results["eval_acc"]
-                            best_steps = global_step
-                            if args.do_test:
-                                results_test = evaluate(args, model, tokenizer, test=True)
-                                for key, value in results_test.items():
-                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
-                                logger.info(
-                                    "test acc: %s, loss: %s, global steps: %s",
-                                    str(results_test["eval_acc"]),
-                                    str(results_test["eval_loss"]),
-                                    str(global_step),
-                                )
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logger.info(
-                        "Average loss: %s at global step: %s",
-                        str((tr_loss - logging_loss) / args.logging_steps),
-                        str(global_step),
-                    )
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step, best_steps
-
-
-def evaluate(args, model, tokenizer, prefix="", test=False):
-    eval_task_names = (args.task_name,)
-    eval_outputs_dirs = (args.output_dir,)
-
-    results = {}
-    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=not test, test=test)
-
-        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(eval_output_dir)
-
-        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        # multi-gpu evaluate
-        if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
-
-        # Eval!
-        logger.info("***** Running evaluation {} *****".format(prefix))
-        logger.info("  Num examples = %d", len(eval_dataset))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0.0
-        nb_eval_steps = 0
-        preds = None
-        out_label_ids = None
-        for batch in tqdm(eval_dataloader, desc="Evaluating"):
-            model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            with torch.no_grad():
-                inputs = {
-                    "input_ids": batch[0],
-                    "attention_mask": batch[1],
-                    "token_type_ids": batch[2]
-                    if args.model_type in ["bert", "xlnet"]
-                    else None,  # XLM don't use segment_ids
-                    "labels": batch[3],
-                }
-                outputs = model(**inputs)
-                tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if preds is None:
-                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs["labels"].detach().cpu().numpy()
-            else:
-                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        preds = np.argmax(preds, axis=1)
-        acc = simple_accuracy(preds, out_label_ids)
-        result = {"eval_acc": acc, "eval_loss": eval_loss}
-        results.update(result)
-
-        output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt")
-
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
-            writer.write("model           =%s\n" % str(args.model_name_or_path))
-            writer.write(
-                "total batch size=%d\n"
-                % (
-                    args.per_gpu_train_batch_size
-                    * args.gradient_accumulation_steps
-                    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
-                )
-            )
-            writer.write("train num epochs=%d\n" % args.num_train_epochs)
-            writer.write("fp16            =%s\n" % args.fp16)
-            writer.write("max seq length  =%d\n" % args.max_seq_length)
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-    return results
-
-
-def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    processor = processors[task]()
-    # Load data features from cache or dataset file
-    if evaluate:
-        cached_mode = "dev"
-    elif test:
-        cached_mode = "test"
-    else:
-        cached_mode = "train"
-    assert not (evaluate and test)
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}_{}".format(
-            cached_mode,
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-            str(task),
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = processor.get_labels()
-        if evaluate:
-            examples = processor.get_dev_examples(args.data_dir)
-        elif test:
-            examples = processor.get_test_examples(args.data_dir)
-        else:
-            examples = processor.get_train_examples(args.data_dir)
-        logger.info("Training number: %s", str(len(examples)))
-        features = convert_examples_to_features(
-            examples,
-            label_list,
-            args.max_seq_length,
-            tokenizer,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
-    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--task_name",
-        default=None,
-        type=str,
-        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare GLUE task
-    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
-        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=args.task_name,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-    best_steps = 0
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        if not args.do_train:
-            args.output_dir = args.model_name_or_path
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    if args.do_test and args.local_rank in [-1, 0]:
-        if not args.do_train:
-            args.output_dir = args.model_name_or_path
-        checkpoints = [args.output_dir]
-        # if args.eval_all_checkpoints: # can not use this to do test!!
-        #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-    if best_steps:
-        logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_ner.py b/server/transformers/examples/run_ner.py
deleted file mode 100644
index a2937985ecbef23b6daf020ff9e68898584e4298..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_ner.py
+++ /dev/null
@@ -1,685 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from seqeval.metrics import f1_score, precision_score, recall_score
-from torch.nn import CrossEntropyLoss
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForTokenClassification,
-    BertTokenizer,
-    CamembertConfig,
-    CamembertForTokenClassification,
-    CamembertTokenizer,
-    DistilBertConfig,
-    DistilBertForTokenClassification,
-    DistilBertTokenizer,
-    RobertaConfig,
-    RobertaForTokenClassification,
-    RobertaTokenizer,
-    XLMRobertaConfig,
-    XLMRobertaForTokenClassification,
-    XLMRobertaTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
-    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
-    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
-    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
-        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-        logger.info("  Continuing training from epoch %d", epochs_trained)
-        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM and RoBERTa don"t use segment_ids
-
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                scheduler.step()  # Update learning rate schedule
-                optimizer.step()
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation %s *****", prefix)
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    preds = None
-    out_label_ids = None
-    model.eval()
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        batch = tuple(t.to(args.device) for t in batch)
-
-        with torch.no_grad():
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM and RoBERTa don"t use segment_ids
-            outputs = model(**inputs)
-            tmp_eval_loss, logits = outputs[:2]
-
-            if args.n_gpu > 1:
-                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
-
-            eval_loss += tmp_eval_loss.item()
-        nb_eval_steps += 1
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            out_label_ids = inputs["labels"].detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-
-    eval_loss = eval_loss / nb_eval_steps
-    preds = np.argmax(preds, axis=2)
-
-    label_map = {i: label for i, label in enumerate(labels)}
-
-    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
-    preds_list = [[] for _ in range(out_label_ids.shape[0])]
-
-    for i in range(out_label_ids.shape[0]):
-        for j in range(out_label_ids.shape[1]):
-            if out_label_ids[i, j] != pad_token_label_id:
-                out_label_list[i].append(label_map[out_label_ids[i][j]])
-                preds_list[i].append(label_map[preds[i][j]])
-
-    results = {
-        "loss": eval_loss,
-        "precision": precision_score(out_label_list, preds_list),
-        "recall": recall_score(out_label_list, preds_list),
-        "f1": f1_score(out_label_list, preds_list),
-    }
-
-    logger.info("***** Eval results %s *****", prefix)
-    for key in sorted(results.keys()):
-        logger.info("  %s = %s", key, str(results[key]))
-
-    return results, preds_list
-
-
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}".format(
-            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        examples = read_examples_from_file(args.data_dir, mode)
-        features = convert_examples_to_features(
-            examples,
-            labels,
-            args.max_seq_length,
-            tokenizer,
-            cls_token_at_end=bool(args.model_type in ["xlnet"]),
-            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ["roberta"]),
-            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args.model_type in ["xlnet"]),
-            # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-            pad_token_label_id=pad_token_label_id,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--labels",
-        default="",
-        type=str,
-        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
-    )
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
-    parser.add_argument(
-        "--evaluate_during_training",
-        action="store_true",
-        help="Whether to run evaluation during training at each logging step.",
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare CONLL-2003 task
-    labels = get_labels(args.labels)
-    num_labels = len(labels)
-    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
-    pad_token_label_id = CrossEntropyLoss().ignore_index
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
-            if global_step:
-                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
-            results.update(result)
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            for key in sorted(results.keys()):
-                writer.write("{} = {}\n".format(key, str(results[key])))
-
-    if args.do_predict and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model = model_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
-        # Save results
-        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
-        with open(output_test_results_file, "w") as writer:
-            for key in sorted(result.keys()):
-                writer.write("{} = {}\n".format(key, str(result[key])))
-        # Save predictions
-        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
-        with open(output_test_predictions_file, "w") as writer:
-            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
-                example_id = 0
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-                        if not predictions[example_id]:
-                            example_id += 1
-                    elif predictions[example_id]:
-                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
-                        writer.write(output_line)
-                    else:
-                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_squad.py b/server/transformers/examples/run_squad.py
deleted file mode 100644
index 86d00bd7701bba2f499a6b2123f71147dcb02461..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_squad.py
+++ /dev/null
@@ -1,837 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-import timeit
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    AlbertConfig,
-    AlbertForQuestionAnswering,
-    AlbertTokenizer,
-    BertConfig,
-    BertForQuestionAnswering,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForQuestionAnswering,
-    DistilBertTokenizer,
-    RobertaConfig,
-    RobertaForQuestionAnswering,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMForQuestionAnswering,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetForQuestionAnswering,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-    squad_convert_examples_to_features,
-)
-from transformers.data.metrics.squad_metrics import (
-    compute_predictions_log_probs,
-    compute_predictions_logits,
-    squad_evaluate,
-)
-from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
-    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def to_list(tensor):
-    return tensor.detach().cpu().tolist()
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 1
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        try:
-            # set global_step to gobal_step of last saved checkpoint from model path
-            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
-            global_step = int(checkpoint_suffix)
-            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-            logger.info("  Continuing training from epoch %d", epochs_trained)
-            logger.info("  Continuing training from global step %d", global_step)
-            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-        except ValueError:
-            logger.info("  Starting fine-tuning.")
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    # Added here for reproductibility
-    set_seed(args)
-
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                "token_type_ids": batch[2],
-                "start_positions": batch[3],
-                "end_positions": batch[4],
-            }
-
-            if args.model_type in ["xlm", "roberta", "distilbert"]:
-                del inputs["token_type_ids"]
-
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
-                if args.version_2_with_negative:
-                    inputs.update({"is_impossible": batch[7]})
-            outputs = model(**inputs)
-            # model outputs are always tuple in transformers (see doc)
-            loss = outputs[0]
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                # Log metrics
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Only evaluate when single GPU otherwise metrics may not average well
-                    if args.local_rank == -1 and args.evaluate_during_training:
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                # Save model checkpoint
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    # Take care of distributed/parallel training
-                    model_to_save = model.module if hasattr(model, "module") else model
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
-
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-
-    all_results = []
-    start_time = timeit.default_timer()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
-        batch = tuple(t.to(args.device) for t in batch)
-
-        with torch.no_grad():
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                "token_type_ids": batch[2],
-            }
-
-            if args.model_type in ["xlm", "roberta", "distilbert"]:
-                del inputs["token_type_ids"]
-
-            example_indices = batch[3]
-
-            # XLNet and XLM use more arguments for their predictions
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
-
-            outputs = model(**inputs)
-
-        for i, example_index in enumerate(example_indices):
-            eval_feature = features[example_index.item()]
-            unique_id = int(eval_feature.unique_id)
-
-            output = [to_list(output[i]) for output in outputs]
-
-            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
-            # models only use two.
-            if len(output) >= 5:
-                start_logits = output[0]
-                start_top_index = output[1]
-                end_logits = output[2]
-                end_top_index = output[3]
-                cls_logits = output[4]
-
-                result = SquadResult(
-                    unique_id,
-                    start_logits,
-                    end_logits,
-                    start_top_index=start_top_index,
-                    end_top_index=end_top_index,
-                    cls_logits=cls_logits,
-                )
-
-            else:
-                start_logits, end_logits = output
-                result = SquadResult(unique_id, start_logits, end_logits)
-
-            all_results.append(result)
-
-    evalTime = timeit.default_timer() - start_time
-    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
-
-    # Compute predictions
-    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
-    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
-
-    if args.version_2_with_negative:
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
-    else:
-        output_null_log_odds_file = None
-
-    # XLNet and XLM use a more complex post-processing procedure
-    if args.model_type in ["xlnet", "xlm"]:
-        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
-        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
-
-        predictions = compute_predictions_log_probs(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            start_n_top,
-            end_n_top,
-            args.version_2_with_negative,
-            tokenizer,
-            args.verbose_logging,
-        )
-    else:
-        predictions = compute_predictions_logits(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            args.do_lower_case,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            args.verbose_logging,
-            args.version_2_with_negative,
-            args.null_score_diff_threshold,
-            tokenizer,
-        )
-
-    # Compute the F1 and exact scores.
-    results = squad_evaluate(examples, predictions)
-    return results
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-        torch.distributed.barrier()
-
-    # Load data features from cache or dataset file
-    input_dir = args.data_dir if args.data_dir else "."
-    cached_features_file = os.path.join(
-        input_dir,
-        "cached_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-        ),
-    )
-
-    # Init features and dataset from cache if it exists
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features_and_dataset = torch.load(cached_features_file)
-        features, dataset, examples = (
-            features_and_dataset["features"],
-            features_and_dataset["dataset"],
-            features_and_dataset["examples"],
-        )
-    else:
-        logger.info("Creating features from dataset file at %s", input_dir)
-
-        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
-            try:
-                import tensorflow_datasets as tfds
-            except ImportError:
-                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
-
-            if args.version_2_with_negative:
-                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
-
-            tfds_examples = tfds.load("squad")
-            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
-        else:
-            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
-            if evaluate:
-                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
-            else:
-                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
-
-        features, dataset = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-            return_dataset="pt",
-            threads=args.threads,
-        )
-
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-        torch.distributed.barrier()
-
-    if output_examples:
-        return dataset, examples, features
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model checkpoints and predictions will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        help="The input data dir. Should contain the .json files for the task."
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--train_file",
-        default=None,
-        type=str,
-        help="The input training file. If a data dir is specified, will look for the file there"
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--predict_file",
-        default=None,
-        type=str,
-        help="The input evaluation file. If a data dir is specified, will look for the file there"
-        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
-    )
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-
-    parser.add_argument(
-        "--version_2_with_negative",
-        action="store_true",
-        help="If true, the SQuAD examples contain some that do not have an answer.",
-    )
-    parser.add_argument(
-        "--null_score_diff_threshold",
-        type=float,
-        default=0.0,
-        help="If null_score - best_non_null is greater than the threshold predict null.",
-    )
-
-    parser.add_argument(
-        "--max_seq_length",
-        default=384,
-        type=int,
-        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-        "longer than this will be truncated, and sequences shorter than this will be padded.",
-    )
-    parser.add_argument(
-        "--doc_stride",
-        default=128,
-        type=int,
-        help="When splitting up a long document into chunks, how much stride to take between chunks.",
-    )
-    parser.add_argument(
-        "--max_query_length",
-        default=64,
-        type=int,
-        help="The maximum number of tokens for the question. Questions longer than this will "
-        "be truncated to this length.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-    parser.add_argument(
-        "--n_best_size",
-        default=20,
-        type=int,
-        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
-    )
-    parser.add_argument(
-        "--max_answer_length",
-        default=30,
-        type=int,
-        help="The maximum length of an answer that can be generated. This is needed because the start "
-        "and end predictions are not conditioned on one another.",
-    )
-    parser.add_argument(
-        "--verbose_logging",
-        action="store_true",
-        help="If true, all of the warnings related to data processing will be printed. "
-        "A number of warnings are expected for a normal SQuAD evaluation.",
-    )
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-
-    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
-    args = parser.parse_args()
-
-    if args.doc_stride >= args.max_seq_length - args.max_query_length:
-        logger.warning(
-            "WARNING - You've set a doc stride which may be superior to the document length in some "
-            "examples. This could result in errors when building features from the examples. Please reduce the doc "
-            "stride or increase the maximum length to ensure the features are correctly built."
-        )
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
-    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
-    # remove the need for this code, but it is still valid.
-    if args.fp16:
-        try:
-            import apex
-
-            apex.amp.register_half_function(torch, "einsum")
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Save the trained model and the tokenizer
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        # Take care of distributed/parallel training
-        model_to_save = model.module if hasattr(model, "module") else model
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)  # , force_download=True)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model.to(args.device)
-
-    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        if args.do_train:
-            logger.info("Loading checkpoints saved during training for evaluation")
-            checkpoints = [args.output_dir]
-            if args.eval_all_checkpoints:
-                checkpoints = list(
-                    os.path.dirname(c)
-                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-                )
-                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
-        else:
-            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
-            checkpoints = [args.model_name_or_path]
-
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        for checkpoint in checkpoints:
-            # Reload the model
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)  # , force_download=True)
-            model.to(args.device)
-
-            # Evaluate
-            result = evaluate(args, model, tokenizer, prefix=global_step)
-
-            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
-            results.update(result)
-
-    logger.info("Results: {}".format(results))
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/run_tf_glue.py b/server/transformers/examples/run_tf_glue.py
deleted file mode 100644
index dae11d22b365be2a417179b689110a44714c7d54..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_tf_glue.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import os
-
-import tensorflow as tf
-import tensorflow_datasets
-
-from transformers import (
-    BertConfig,
-    BertForSequenceClassification,
-    BertTokenizer,
-    TFBertForSequenceClassification,
-    glue_convert_examples_to_features,
-    glue_processors,
-)
-
-
-# script parameters
-BATCH_SIZE = 32
-EVAL_BATCH_SIZE = BATCH_SIZE * 2
-USE_XLA = False
-USE_AMP = False
-EPOCHS = 3
-
-TASK = "mrpc"
-
-if TASK == "sst-2":
-    TFDS_TASK = "sst2"
-elif TASK == "sts-b":
-    TFDS_TASK = "stsb"
-else:
-    TFDS_TASK = TASK
-
-num_labels = len(glue_processors[TASK]().get_labels())
-print(num_labels)
-
-tf.config.optimizer.set_jit(USE_XLA)
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
-
-# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
-config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
-tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config)
-
-# Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
-train_examples = info.splits["train"].num_examples
-
-# MNLI expects either validation_matched or validation_mismatched
-valid_examples = info.splits["validation"].num_examples
-
-# Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK)
-
-# MNLI expects either validation_matched or validation_mismatched
-valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK)
-train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
-valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
-
-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
-opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
-if USE_AMP:
-    # loss scaling is currently required when using mixed precision
-    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
-
-
-if num_labels == 1:
-    loss = tf.keras.losses.MeanSquaredError()
-else:
-    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-
-metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
-model.compile(optimizer=opt, loss=loss, metrics=[metric])
-
-# Train and evaluate using tf.keras.Model.fit()
-train_steps = train_examples // BATCH_SIZE
-valid_steps = valid_examples // EVAL_BATCH_SIZE
-
-history = model.fit(
-    train_dataset,
-    epochs=EPOCHS,
-    steps_per_epoch=train_steps,
-    validation_data=valid_dataset,
-    validation_steps=valid_steps,
-)
-
-# Save TF2 model
-os.makedirs("./save/", exist_ok=True)
-model.save_pretrained("./save/")
-
-if TASK == "mrpc":
-    # Load the TensorFlow model in PyTorch for inspection
-    # This is to demo the interoperability between the two frameworks, you don't have to
-    # do this in real life (you can run the inference on the TF model).
-    pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)
-
-    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-    sentence_0 = "This research was consistent with his findings."
-    sentence_1 = "His findings were compatible with this research."
-    sentence_2 = "His findings were not compatible with this research."
-    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
-    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")
-
-    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-    print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-    print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
diff --git a/server/transformers/examples/run_tf_ner.py b/server/transformers/examples/run_tf_ner.py
deleted file mode 100644
index ef970d839016a49c8af076bacf386904ded9221e..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_tf_ner.py
+++ /dev/null
@@ -1,655 +0,0 @@
-# coding=utf-8
-import collections
-import datetime
-import glob
-import math
-import os
-import re
-
-import numpy as np
-import tensorflow as tf
-from absl import app, flags, logging
-from seqeval import metrics
-
-from transformers import (
-    TF2_WEIGHTS_NAME,
-    BertConfig,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertTokenizer,
-    GradientAccumulator,
-    RobertaConfig,
-    RobertaTokenizer,
-    TFBertForTokenClassification,
-    TFDistilBertForTokenClassification,
-    TFRobertaForTokenClassification,
-    create_optimizer,
-)
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
-
-
-try:
-    from fastprogress import master_bar, progress_bar
-except ImportError:
-    from fastprogress.fastprogress import master_bar, progress_bar
-
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), ()
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
-    "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
-}
-
-
-flags.DEFINE_string(
-    "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
-)
-
-flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-
-flags.DEFINE_string(
-    "model_name_or_path",
-    None,
-    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-)
-
-flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")
-
-flags.DEFINE_string(
-    "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
-)
-
-flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")
-
-flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")
-
-flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")
-
-flags.DEFINE_integer(
-    "max_seq_length",
-    128,
-    "The maximum total input sentence length after tokenization. "
-    "Sequences longer than this will be truncated, sequences shorter "
-    "will be padded.",
-)
-
-flags.DEFINE_string(
-    "tpu",
-    None,
-    "The Cloud TPU to use for training. This should be either the name "
-    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.",
-)
-
-flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
-
-flags.DEFINE_boolean("do_train", False, "Whether to run training.")
-
-flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
-
-flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")
-
-flags.DEFINE_boolean(
-    "evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
-)
-
-flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")
-
-flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")
-
-flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")
-
-flags.DEFINE_integer(
-    "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
-)
-
-flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
-
-flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
-
-flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
-
-flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
-
-flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")
-
-flags.DEFINE_integer(
-    "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
-)
-
-flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")
-
-flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")
-
-flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")
-
-flags.DEFINE_boolean(
-    "eval_all_checkpoints",
-    False,
-    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-)
-
-flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")
-
-flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")
-
-flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")
-
-flags.DEFINE_integer("seed", 42, "random seed for initialization")
-
-flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")
-
-flags.DEFINE_string(
-    "gpus",
-    "0",
-    "Comma separated list of gpus devices. If only one, switch to single "
-    "gpu strategy, if None takes all the gpus available.",
-)
-
-
-def train(
-    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
-):
-    if args["max_steps"] > 0:
-        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
-        args["num_train_epochs"] = 1
-    else:
-        num_train_steps = (
-            math.ceil(num_train_examples / train_batch_size)
-            // args["gradient_accumulation_steps"]
-            * args["num_train_epochs"]
-        )
-
-    writer = tf.summary.create_file_writer("/tmp/mylogs")
-
-    with strategy.scope():
-        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
-
-        if args["fp16"]:
-            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
-
-        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
-        gradient_accumulator = GradientAccumulator()
-
-    logging.info("***** Running training *****")
-    logging.info("  Num examples = %d", num_train_examples)
-    logging.info("  Num Epochs = %d", args["num_train_epochs"])
-    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
-    logging.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        train_batch_size * args["gradient_accumulation_steps"],
-    )
-    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
-    logging.info("  Total training steps = %d", num_train_steps)
-
-    model.summary()
-
-    @tf.function
-    def apply_gradients():
-        grads_and_vars = []
-
-        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
-            if gradient is not None:
-                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
-                grads_and_vars.append((scaled_gradient, variable))
-            else:
-                grads_and_vars.append((gradient, variable))
-
-        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
-        gradient_accumulator.reset()
-
-    @tf.function
-    def train_step(train_features, train_labels):
-        def step_fn(train_features, train_labels):
-            inputs = {"attention_mask": train_features["input_mask"], "training": True}
-
-            if args["model_type"] != "distilbert":
-                inputs["token_type_ids"] = (
-                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
-                )
-
-            with tf.GradientTape() as tape:
-                logits = model(train_features["input_ids"], **inputs)[0]
-                logits = tf.reshape(logits, (-1, len(labels) + 1))
-                active_loss = tf.reshape(train_features["input_mask"], (-1,))
-                active_logits = tf.boolean_mask(logits, active_loss)
-                train_labels = tf.reshape(train_labels, (-1,))
-                active_labels = tf.boolean_mask(train_labels, active_loss)
-                cross_entropy = loss_fct(active_labels, active_logits)
-                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
-                grads = tape.gradient(loss, model.trainable_variables)
-
-                gradient_accumulator(grads)
-
-            return cross_entropy
-
-        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
-        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
-
-        return mean_loss
-
-    current_time = datetime.datetime.now()
-    train_iterator = master_bar(range(args["num_train_epochs"]))
-    global_step = 0
-    logging_loss = 0.0
-
-    for epoch in train_iterator:
-        epoch_iterator = progress_bar(
-            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
-        )
-        step = 1
-
-        with strategy.scope():
-            for train_features, train_labels in epoch_iterator:
-                loss = train_step(train_features, train_labels)
-
-                if step % args["gradient_accumulation_steps"] == 0:
-                    strategy.experimental_run_v2(apply_gradients)
-
-                    loss_metric(loss)
-
-                    global_step += 1
-
-                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
-                        # Log metrics
-                        if (
-                            args["n_device"] == 1 and args["evaluate_during_training"]
-                        ):  # Only evaluate when single GPU otherwise metrics may not average well
-                            y_true, y_pred, eval_loss = evaluate(
-                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
-                            )
-                            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-                            logging.info("Eval at step " + str(global_step) + "\n" + report)
-                            logging.info("eval_loss: " + str(eval_loss))
-
-                            precision = metrics.precision_score(y_true, y_pred)
-                            recall = metrics.recall_score(y_true, y_pred)
-                            f1 = metrics.f1_score(y_true, y_pred)
-
-                            with writer.as_default():
-                                tf.summary.scalar("eval_loss", eval_loss, global_step)
-                                tf.summary.scalar("precision", precision, global_step)
-                                tf.summary.scalar("recall", recall, global_step)
-                                tf.summary.scalar("f1", f1, global_step)
-
-                        lr = optimizer.learning_rate
-                        learning_rate = lr(step)
-
-                        with writer.as_default():
-                            tf.summary.scalar("lr", learning_rate, global_step)
-                            tf.summary.scalar(
-                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
-                            )
-
-                        logging_loss = loss_metric.result()
-
-                    with writer.as_default():
-                        tf.summary.scalar("loss", loss_metric.result(), step=step)
-
-                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
-                        # Save model checkpoint
-                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
-
-                        if not os.path.exists(output_dir):
-                            os.makedirs(output_dir)
-
-                        model.save_pretrained(output_dir)
-                        logging.info("Saving model checkpoint to %s", output_dir)
-
-                train_iterator.child.comment = f"loss : {loss_metric.result()}"
-                step += 1
-
-        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
-
-        loss_metric.reset_states()
-
-    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))
-
-
-def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
-    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
-    eval_dataset, size = load_and_cache_examples(
-        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
-    )
-    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
-    preds = None
-    num_eval_steps = math.ceil(size / eval_batch_size)
-    master = master_bar(range(1))
-    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
-    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-    loss = 0.0
-
-    logging.info("***** Running evaluation *****")
-    logging.info("  Num examples = %d", size)
-    logging.info("  Batch size = %d", eval_batch_size)
-
-    for eval_features, eval_labels in eval_iterator:
-        inputs = {"attention_mask": eval_features["input_mask"], "training": False}
-
-        if args["model_type"] != "distilbert":
-            inputs["token_type_ids"] = (
-                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
-            )
-
-        with strategy.scope():
-            logits = model(eval_features["input_ids"], **inputs)[0]
-            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
-            active_logits = tf.boolean_mask(tmp_logits, active_loss)
-            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
-            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
-            cross_entropy = loss_fct(active_labels, active_logits)
-            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
-
-        if preds is None:
-            preds = logits.numpy()
-            label_ids = eval_labels.numpy()
-        else:
-            preds = np.append(preds, logits.numpy(), axis=0)
-            label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)
-
-    preds = np.argmax(preds, axis=2)
-    y_pred = [[] for _ in range(label_ids.shape[0])]
-    y_true = [[] for _ in range(label_ids.shape[0])]
-    loss = loss / num_eval_steps
-
-    for i in range(label_ids.shape[0]):
-        for j in range(label_ids.shape[1]):
-            if label_ids[i, j] != pad_token_label_id:
-                y_pred[i].append(labels[preds[i, j] - 1])
-                y_true[i].append(labels[label_ids[i, j] - 1])
-
-    return y_true, y_pred, loss.numpy()
-
-
-def load_cache(cached_file, max_seq_length):
-    name_to_features = {
-        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-    }
-
-    def _decode_record(record):
-        example = tf.io.parse_single_example(record, name_to_features)
-        features = {}
-        features["input_ids"] = example["input_ids"]
-        features["input_mask"] = example["input_mask"]
-        features["segment_ids"] = example["segment_ids"]
-
-        return features, example["label_ids"]
-
-    d = tf.data.TFRecordDataset(cached_file)
-    d = d.map(_decode_record, num_parallel_calls=4)
-    count = d.reduce(0, lambda x, _: x + 1)
-
-    return d, count.numpy()
-
-
-def save_cache(features, cached_features_file):
-    writer = tf.io.TFRecordWriter(cached_features_file)
-
-    for (ex_index, feature) in enumerate(features):
-        if ex_index % 5000 == 0:
-            logging.info("Writing example %d of %d" % (ex_index, len(features)))
-
-        def create_int_feature(values):
-            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-            return f
-
-        record_feature = collections.OrderedDict()
-        record_feature["input_ids"] = create_int_feature(feature.input_ids)
-        record_feature["input_mask"] = create_int_feature(feature.input_mask)
-        record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
-        record_feature["label_ids"] = create_int_feature(feature.label_ids)
-
-        tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))
-
-        writer.write(tf_example.SerializeToString())
-
-    writer.close()
-
-
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
-    drop_remainder = True if args["tpu"] or mode == "train" else False
-
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args["data_dir"],
-        "cached_{}_{}_{}.tf_record".format(
-            mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
-        logging.info("Loading features from cached file %s", cached_features_file)
-        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
-    else:
-        logging.info("Creating features from dataset file at %s", args["data_dir"])
-        examples = read_examples_from_file(args["data_dir"], mode)
-        features = convert_examples_to_features(
-            examples,
-            labels,
-            args["max_seq_length"],
-            tokenizer,
-            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
-            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args["model_type"] in ["roberta"]),
-            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args["model_type"] in ["xlnet"]),
-            # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
-            pad_token_label_id=pad_token_label_id,
-        )
-        logging.info("Saving features into cached file %s", cached_features_file)
-        save_cache(features, cached_features_file)
-        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
-
-    if mode == "train":
-        dataset = dataset.repeat()
-        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])
-
-    dataset = dataset.batch(batch_size, drop_remainder)
-    dataset = dataset.prefetch(buffer_size=batch_size)
-
-    return dataset, size
-
-
-def main(_):
-    logging.set_verbosity(logging.INFO)
-    args = flags.FLAGS.flag_values_dict()
-
-    if (
-        os.path.exists(args["output_dir"])
-        and os.listdir(args["output_dir"])
-        and args["do_train"]
-        and not args["overwrite_output_dir"]
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args["output_dir"]
-            )
-        )
-
-    if args["fp16"]:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if args["tpu"]:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
-        tf.config.experimental_connect_to_cluster(resolver)
-        tf.tpu.experimental.initialize_tpu_system(resolver)
-        strategy = tf.distribute.experimental.TPUStrategy(resolver)
-        args["n_device"] = args["num_tpu_cores"]
-    elif len(args["gpus"].split(",")) > 1:
-        args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
-        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
-    elif args["no_cuda"]:
-        args["n_device"] = 1
-        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
-    else:
-        args["n_device"] = len(args["gpus"].split(","))
-        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])
-
-    logging.warning(
-        "n_device: %s, distributed training: %s, 16-bits training: %s",
-        args["n_device"],
-        bool(args["n_device"] > 1),
-        args["fp16"],
-    )
-
-    labels = get_labels(args["labels"])
-    num_labels = len(labels) + 1
-    pad_token_label_id = 0
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
-    config = config_class.from_pretrained(
-        args["config_name"] if args["config_name"] else args["model_name_or_path"],
-        num_labels=num_labels,
-        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-    )
-
-    logging.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args["do_train"]:
-        tokenizer = tokenizer_class.from_pretrained(
-            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
-            do_lower_case=args["do_lower_case"],
-            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-        )
-
-        with strategy.scope():
-            model = model_class.from_pretrained(
-                args["model_name_or_path"],
-                from_pt=bool(".bin" in args["model_name_or_path"]),
-                config=config,
-                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-            )
-            model.layers[-1].activation = tf.keras.activations.softmax
-
-        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
-        train_dataset, num_train_examples = load_and_cache_examples(
-            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
-        )
-        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
-        train(
-            args,
-            strategy,
-            train_dataset,
-            tokenizer,
-            model,
-            num_train_examples,
-            labels,
-            train_batch_size,
-            pad_token_label_id,
-        )
-
-        if not os.path.exists(args["output_dir"]):
-            os.makedirs(args["output_dir"])
-
-        logging.info("Saving model to %s", args["output_dir"])
-
-        model.save_pretrained(args["output_dir"])
-        tokenizer.save_pretrained(args["output_dir"])
-
-    # Evaluation
-    if args["do_eval"]:
-        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
-        checkpoints = []
-        results = []
-
-        if args["eval_all_checkpoints"]:
-            checkpoints = list(
-                os.path.dirname(c)
-                for c in sorted(
-                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
-                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
-                )
-            )
-
-        logging.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        if len(checkpoints) == 0:
-            checkpoints.append(args["output_dir"])
-
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
-
-            with strategy.scope():
-                model = model_class.from_pretrained(checkpoint)
-
-            y_true, y_pred, eval_loss = evaluate(
-                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
-            )
-            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-            if global_step:
-                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
-
-        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
-
-        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
-            for res in results:
-                for key, val in res.items():
-                    if "loss" in key:
-                        logging.info(key + " = " + str(val))
-                        writer.write(key + " = " + str(val))
-                        writer.write("\n")
-                    else:
-                        logging.info(key)
-                        logging.info("\n" + report)
-                        writer.write(key + "\n")
-                        writer.write(report)
-                        writer.write("\n")
-
-    if args["do_predict"]:
-        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
-        model = model_class.from_pretrained(args["output_dir"])
-        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
-        predict_dataset, _ = load_and_cache_examples(
-            args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
-        )
-        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
-        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
-        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
-        report = metrics.classification_report(y_true, y_pred, digits=4)
-
-        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
-            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-            logging.info("\n" + report)
-
-            writer.write(report)
-            writer.write("\n\nloss = " + str(pred_loss))
-
-        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
-            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
-                example_id = 0
-
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-
-                        if not y_pred[example_id]:
-                            example_id += 1
-                    elif y_pred[example_id]:
-                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
-                        writer.write(output_line)
-                    else:
-                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("data_dir")
-    flags.mark_flag_as_required("output_dir")
-    flags.mark_flag_as_required("model_name_or_path")
-    flags.mark_flag_as_required("model_type")
-    app.run(main)
diff --git a/server/transformers/examples/run_xnli.py b/server/transformers/examples/run_xnli.py
deleted file mode 100644
index e995d27f1bd945e9c40915e9bdbe94970b6b62c4..0000000000000000000000000000000000000000
--- a/server/transformers/examples/run_xnli.py
+++ /dev/null
@@ -1,653 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
-    Adapted from `examples/run_glue.py`"""
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForSequenceClassification,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForSequenceClassification,
-    DistilBertTokenizer,
-    XLMConfig,
-    XLMForSequenceClassification,
-    XLMTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from transformers import glue_convert_examples_to_features as convert_examples_to_features
-from transformers import xnli_compute_metrics as compute_metrics
-from transformers import xnli_output_modes as output_modes
-from transformers import xnli_processors as processors
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
-    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
-        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
-        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-        logger.info("  Continuing training from epoch %d", epochs_trained)
-        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert"] else None
-                )  # XLM and DistilBERT don't use segment_ids
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    eval_task_names = (args.task_name,)
-    eval_outputs_dirs = (args.output_dir,)
-
-    results = {}
-    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-
-        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(eval_output_dir)
-
-        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        # multi-gpu eval
-        if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
-
-        # Eval!
-        logger.info("***** Running evaluation {} *****".format(prefix))
-        logger.info("  Num examples = %d", len(eval_dataset))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0.0
-        nb_eval_steps = 0
-        preds = None
-        out_label_ids = None
-        for batch in tqdm(eval_dataloader, desc="Evaluating"):
-            model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
-
-            with torch.no_grad():
-                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                if args.model_type != "distilbert":
-                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert"] else None
-                    )  # XLM and DistilBERT don't use segment_ids
-                outputs = model(**inputs)
-                tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if preds is None:
-                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs["labels"].detach().cpu().numpy()
-            else:
-                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        if args.output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        else:
-            raise ValueError("No other `output_mode` for XNLI.")
-        result = compute_metrics(eval_task, preds, out_label_ids)
-        results.update(result)
-
-        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results {} *****".format(prefix))
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return results
-
-
-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    processor = processors[task](language=args.language, train_language=args.train_language)
-    output_mode = output_modes[task]
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}_{}_{}".format(
-            "test" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-            str(task),
-            str(args.train_language if (not evaluate and args.train_language is not None) else args.language),
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = processor.get_labels()
-        examples = (
-            processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        )
-        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=False,
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=0,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    if output_mode == "classification":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-    else:
-        raise ValueError("No other `output_mode` for XNLI.")
-
-    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--language",
-        default=None,
-        type=str,
-        required=True,
-        help="Evaluation language. Also train language if `train_language` is set to None.",
-    )
-    parser.add_argument(
-        "--train_language", default=None, type=str, help="Train language if is different of the evaluation language."
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare XNLI task
-    args.task_name = "xnli"
-    if args.task_name not in processors:
-        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name](language=args.language, train_language=args.train_language)
-    args.output_mode = output_modes[args.task_name]
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=args.task_name,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/summarization/README.md b/server/transformers/examples/summarization/README.md
deleted file mode 100644
index 250c4bcfe8471a85690be3393b1f4e00124a4442..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/README.md
+++ /dev/null
@@ -1,61 +0,0 @@
-# Text Summarization with Pretrained Encoders
-
-This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
-
-The original code can be found on the Yang Liu's [github repository](https://github.com/nlpyang/PreSumm).
-
-The model is loaded with the pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset with an extractive and then abstractive tasks.
-
-## Setup
-
-```
-git clone https://github.com/huggingface/transformers && cd transformers
-pip install .
-pip install nltk py-rouge
-cd examples/summarization
-```
-
-## Reproduce the authors' results on ROUGE
-
-To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
-
-```bash
-tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
-```
-
-And move all the stories to the same folder. We will refer as `$DATA_PATH` the path to where you uncompressed both archive. Then run the following in the same folder as `run_summarization.py`:
-
-```bash
-python run_summarization.py \
-    --documents_dir $DATA_PATH \
-    --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --no_cuda false \
-    --batch_size 4 \
-    --min_length 50 \
-    --max_length 200 \
-    --beam_size 5 \
-    --alpha 0.95 \
-    --block_trigram true \
-    --compute_rouge true
-```
-
-The scripts executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
-
-## Summarize any text
-
-Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
-
-```bash
-python run_summarization.py \
-    --documents_dir $DATA_PATH \
-    --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --no_cuda false \
-    --batch_size 4 \
-    --min_length 50 \
-    --max_length 200 \
-    --beam_size 5 \
-    --alpha 0.95 \
-    --block_trigram true \
-```
-
-You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
diff --git a/server/transformers/examples/summarization/configuration_bertabs.py b/server/transformers/examples/summarization/configuration_bertabs.py
deleted file mode 100644
index c976180b2fc4d76e29f557e38bcf0708dc4ccbc0..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/configuration_bertabs.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# coding=utf-8
-# Copyright 2019 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" BertAbs configuration """
-import logging
-
-from transformers import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-
-BERTABS_FINETUNED_CONFIG_MAP = {
-    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
-}
-
-
-class BertAbsConfig(PretrainedConfig):
-    r""" Class to store the configuration of the BertAbs model.
-
-    Arguments:
-        vocab_size: int
-            Number of tokens in the vocabulary.
-        max_pos: int
-            The maximum sequence length that this model will be used with.
-        enc_layer: int
-            The numner of hidden layers in the Transformer encoder.
-        enc_hidden_size: int
-            The size of the encoder's layers.
-        enc_heads: int
-            The number of attention heads for each attention layer in the encoder.
-        enc_ff_size: int
-            The size of the encoder's feed-forward layers.
-        enc_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
-            embeddings, layers, pooler and also the attention probabilities in
-            the encoder.
-        dec_layer: int
-            The numner of hidden layers in the decoder.
-        dec_hidden_size: int
-            The size of the decoder's layers.
-        dec_heads: int
-            The number of attention heads for each attention layer in the decoder.
-        dec_ff_size: int
-            The size of the decoder's feed-forward layers.
-        dec_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
-            embeddings, layers, pooler and also the attention probabilities in
-            the decoder.
-    """
-
-    pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP
-    model_type = "bertabs"
-
-    def __init__(
-        self,
-        vocab_size=30522,
-        max_pos=512,
-        enc_layers=6,
-        enc_hidden_size=512,
-        enc_heads=8,
-        enc_ff_size=512,
-        enc_dropout=0.2,
-        dec_layers=6,
-        dec_hidden_size=768,
-        dec_heads=8,
-        dec_ff_size=2048,
-        dec_dropout=0.2,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.max_pos = max_pos
-
-        self.enc_layers = enc_layers
-        self.enc_hidden_size = enc_hidden_size
-        self.enc_heads = enc_heads
-        self.enc_ff_size = enc_ff_size
-        self.enc_dropout = enc_dropout
-
-        self.dec_layers = dec_layers
-        self.dec_hidden_size = dec_hidden_size
-        self.dec_heads = dec_heads
-        self.dec_ff_size = dec_ff_size
-        self.dec_dropout = dec_dropout
diff --git a/server/transformers/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/server/transformers/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
deleted file mode 100644
index a1cbd64dd8e9923d11d525e08cab8cd79ef50461..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Convert BertExtAbs's checkpoints.
-
-The script looks like it is doing something trivial but it is not. The "weights"
-proposed by the authors are actually the entire model pickled. We need to load
-the model within the original codebase to be able to only save its `state_dict`.
-"""
-
-import argparse
-import logging
-from collections import namedtuple
-
-import torch
-
-from model_bertabs import BertAbsSummarizer
-from models.model_builder import AbsSummarizer  # The authors' implementation
-from transformers import BertTokenizer
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-SAMPLE_TEXT = "Hello world! cécé herlolip"
-
-
-BertAbsConfig = namedtuple(
-    "BertAbsConfig",
-    [
-        "temp_dir",
-        "large",
-        "use_bert_emb",
-        "finetune_bert",
-        "encoder",
-        "share_emb",
-        "max_pos",
-        "enc_layers",
-        "enc_hidden_size",
-        "enc_heads",
-        "enc_ff_size",
-        "enc_dropout",
-        "dec_layers",
-        "dec_hidden_size",
-        "dec_heads",
-        "dec_ff_size",
-        "dec_dropout",
-    ],
-)
-
-
-def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
-    """ Copy/paste and tweak the pre-trained weights provided by the creators
-    of BertAbs for the internal architecture.
-    """
-
-    # Instantiate the authors' model with the pre-trained weights
-    config = BertAbsConfig(
-        temp_dir=".",
-        finetune_bert=False,
-        large=False,
-        share_emb=True,
-        use_bert_emb=False,
-        encoder="bert",
-        max_pos=512,
-        enc_layers=6,
-        enc_hidden_size=512,
-        enc_heads=8,
-        enc_ff_size=512,
-        enc_dropout=0.2,
-        dec_layers=6,
-        dec_hidden_size=768,
-        dec_heads=8,
-        dec_ff_size=2048,
-        dec_dropout=0.2,
-    )
-    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
-    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
-    original.eval()
-
-    new_model = BertAbsSummarizer(config, torch.device("cpu"))
-    new_model.eval()
-
-    # -------------------
-    # Convert the weights
-    # -------------------
-
-    logging.info("convert the model")
-    new_model.bert.load_state_dict(original.bert.state_dict())
-    new_model.decoder.load_state_dict(original.decoder.state_dict())
-    new_model.generator.load_state_dict(original.generator.state_dict())
-
-    # ----------------------------------
-    # Make sure the outpus are identical
-    # ----------------------------------
-
-    logging.info("Make sure that the models' outputs are identical")
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-
-    # prepare the model inputs
-    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
-    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
-    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
-    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
-    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
-    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
-
-    # failsafe to make sure the weights reset does not affect the
-    # loaded weights.
-    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0
-
-    # forward pass
-    src = encoder_input_ids
-    tgt = decoder_input_ids
-    segs = token_type_ids = None
-    clss = None
-    mask_src = encoder_attention_mask = None
-    mask_tgt = decoder_attention_mask = None
-    mask_cls = None
-
-    # The original model does not apply the geneator layer immediatly but rather in
-    # the beam search (where it combines softmax + linear layer). Since we already
-    # apply the softmax in our generation process we only apply the linear layer here.
-    # We make sure that the outputs of the full stack are identical
-    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
-    output_original_generator = original.generator(output_original_model)
-
-    output_converted_model = new_model(
-        encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
-    )[0]
-    output_converted_generator = new_model.generator(output_converted_model)
-
-    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
-    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
-    maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
-    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
-
-    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
-    if are_identical:
-        logging.info("all weights are equal up to 1e-3")
-    else:
-        raise ValueError("the weights are different. The new model is likely different from the original one.")
-
-    # The model has been saved with torch.save(model) and this is bound to the exact
-    # directory structure. We save the state_dict instead.
-    logging.info("saving the model's state dictionary")
-    torch.save(
-        new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
-    )
-    args = parser.parse_args()
-
-    convert_bertabs_checkpoints(
-        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
-    )
diff --git a/server/transformers/examples/summarization/modeling_bertabs.py b/server/transformers/examples/summarization/modeling_bertabs.py
deleted file mode 100644
index bad412baac1dd38d3bf5742a629ee83a9b6c7b0b..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/modeling_bertabs.py
+++ /dev/null
@@ -1,1027 +0,0 @@
-# MIT License
-
-# Copyright (c) 2019 Yang Liu and the HuggingFace team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import copy
-import math
-
-import numpy as np
-import torch
-from torch import nn
-from torch.nn.init import xavier_uniform_
-
-from configuration_bertabs import BertAbsConfig
-from transformers import BertConfig, BertModel, PreTrainedModel
-
-
-MAX_SIZE = 5000
-
-BERTABS_FINETUNED_MODEL_MAP = {
-    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin",
-}
-
-
-class BertAbsPreTrainedModel(PreTrainedModel):
-    config_class = BertAbsConfig
-    pretrained_model_archive_map = BERTABS_FINETUNED_MODEL_MAP
-    load_tf_weights = False
-    base_model_prefix = "bert"
-
-
-class BertAbs(BertAbsPreTrainedModel):
-    def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
-        super().__init__(args)
-        self.args = args
-        self.bert = Bert()
-
-        # If pre-trained weights are passed for Bert, load these.
-        load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
-        if load_bert_pretrained_extractive:
-            self.bert.model.load_state_dict(
-                dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]),
-                strict=True,
-            )
-
-        self.vocab_size = self.bert.model.config.vocab_size
-
-        if args.max_pos > 512:
-            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
-            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
-            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
-                None, :
-            ].repeat(args.max_pos - 512, 1)
-            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
-        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
-
-        tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
-
-        self.decoder = TransformerDecoder(
-            self.args.dec_layers,
-            self.args.dec_hidden_size,
-            heads=self.args.dec_heads,
-            d_ff=self.args.dec_ff_size,
-            dropout=self.args.dec_dropout,
-            embeddings=tgt_embeddings,
-            vocab_size=self.vocab_size,
-        )
-
-        gen_func = nn.LogSoftmax(dim=-1)
-        self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func)
-        self.generator[0].weight = self.decoder.embeddings.weight
-
-        load_from_checkpoints = False if checkpoint is None else True
-        if load_from_checkpoints:
-            self.load_state_dict(checkpoint)
-
-    def init_weights(self):
-        for module in self.decoder.modules():
-            if isinstance(module, (nn.Linear, nn.Embedding)):
-                module.weight.data.normal_(mean=0.0, std=0.02)
-            elif isinstance(module, nn.LayerNorm):
-                module.bias.data.zero_()
-                module.weight.data.fill_(1.0)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-        for p in self.generator.parameters():
-            if p.dim() > 1:
-                xavier_uniform_(p)
-            else:
-                p.data.zero_()
-
-    def forward(
-        self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
-    ):
-        encoder_output = self.bert(
-            input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
-        )
-        encoder_hidden_states = encoder_output[0]
-        dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
-        decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state)
-        return decoder_outputs
-
-
-class Bert(nn.Module):
-    """ This class is not really necessary and should probably disappear.
-    """
-
-    def __init__(self):
-        super().__init__()
-        config = BertConfig.from_pretrained("bert-base-uncased")
-        self.model = BertModel(config)
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
-        self.eval()
-        with torch.no_grad():
-            encoder_outputs, _ = self.model(
-                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs
-            )
-        return encoder_outputs
-
-
-class TransformerDecoder(nn.Module):
-    """
-    The Transformer decoder from "Attention is All You Need".
-
-    Args:
-       num_layers (int): number of encoder layers.
-       d_model (int): size of the model
-       heads (int): number of heads
-       d_ff (int): size of the inner FF layer
-       dropout (float): dropout parameters
-       embeddings (:obj:`onmt.modules.Embeddings`):
-          embeddings to use, should have positional encodings
-       attn_type (str): if using a seperate copy attention
-    """
-
-    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
-        super().__init__()
-
-        # Basic attributes.
-        self.decoder_type = "transformer"
-        self.num_layers = num_layers
-        self.embeddings = embeddings
-        self.pos_emb = PositionalEncoding(dropout, self.embeddings.embedding_dim)
-
-        # Build TransformerDecoder.
-        self.transformer_layers = nn.ModuleList(
-            [TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)]
-        )
-
-        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
-
-    # forward(input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask)
-    # def forward(self, input_ids, state, attention_mask=None, memory_lengths=None,
-    # step=None, cache=None, encoder_attention_mask=None, encoder_hidden_states=None, memory_masks=None):
-    def forward(
-        self,
-        input_ids,
-        encoder_hidden_states=None,
-        state=None,
-        attention_mask=None,
-        memory_lengths=None,
-        step=None,
-        cache=None,
-        encoder_attention_mask=None,
-    ):
-        """
-        See :obj:`onmt.modules.RNNDecoderBase.forward()`
-        memory_bank = encoder_hidden_states
-        """
-        # Name conversion
-        tgt = input_ids
-        memory_bank = encoder_hidden_states
-        memory_mask = encoder_attention_mask
-
-        # src_words = state.src
-        src_words = state.src
-        src_batch, src_len = src_words.size()
-
-        padding_idx = self.embeddings.padding_idx
-
-        # Decoder padding mask
-        tgt_words = tgt
-        tgt_batch, tgt_len = tgt_words.size()
-        tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
-
-        # Encoder padding mask
-        if memory_mask is not None:
-            src_len = memory_mask.size(-1)
-            src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
-        else:
-            src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len)
-
-        # Pass through the embeddings
-        emb = self.embeddings(input_ids)
-        output = self.pos_emb(emb, step)
-        assert emb.dim() == 3  # len x batch x embedding_dim
-
-        if state.cache is None:
-            saved_inputs = []
-
-        for i in range(self.num_layers):
-            prev_layer_input = None
-            if state.cache is None:
-                if state.previous_input is not None:
-                    prev_layer_input = state.previous_layer_inputs[i]
-
-            output, all_input = self.transformer_layers[i](
-                output,
-                memory_bank,
-                src_pad_mask,
-                tgt_pad_mask,
-                previous_input=prev_layer_input,
-                layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None,
-                step=step,
-            )
-            if state.cache is None:
-                saved_inputs.append(all_input)
-
-        if state.cache is None:
-            saved_inputs = torch.stack(saved_inputs)
-
-        output = self.layer_norm(output)
-
-        if state.cache is None:
-            state = state.update_state(tgt, saved_inputs)
-
-        # Decoders in transformers return a tuple. Beam search will fail
-        # if we don't follow this convention.
-        return output, state  # , state
-
-    def init_decoder_state(self, src, memory_bank, with_cache=False):
-        """ Init decoder state """
-        state = TransformerDecoderState(src)
-        if with_cache:
-            state._init_cache(memory_bank, self.num_layers)
-        return state
-
-
-class PositionalEncoding(nn.Module):
-    def __init__(self, dropout, dim, max_len=5000):
-        pe = torch.zeros(max_len, dim)
-        position = torch.arange(0, max_len).unsqueeze(1)
-        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)))
-        pe[:, 0::2] = torch.sin(position.float() * div_term)
-        pe[:, 1::2] = torch.cos(position.float() * div_term)
-        pe = pe.unsqueeze(0)
-        super().__init__()
-        self.register_buffer("pe", pe)
-        self.dropout = nn.Dropout(p=dropout)
-        self.dim = dim
-
-    def forward(self, emb, step=None):
-        emb = emb * math.sqrt(self.dim)
-        if step:
-            emb = emb + self.pe[:, step][:, None, :]
-
-        else:
-            emb = emb + self.pe[:, : emb.size(1)]
-        emb = self.dropout(emb)
-        return emb
-
-    def get_emb(self, emb):
-        return self.pe[:, : emb.size(1)]
-
-
-class TransformerDecoderLayer(nn.Module):
-    """
-    Args:
-      d_model (int): the dimension of keys/values/queries in
-                       MultiHeadedAttention, also the input size of
-                       the first-layer of the PositionwiseFeedForward.
-      heads (int): the number of heads for MultiHeadedAttention.
-      d_ff (int): the second-layer of the PositionwiseFeedForward.
-      dropout (float): dropout probability(0-1.0).
-      self_attn_type (string): type of self-attention scaled-dot, average
-    """
-
-    def __init__(self, d_model, heads, d_ff, dropout):
-        super().__init__()
-
-        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
-
-        self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
-        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
-        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
-        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
-        self.drop = nn.Dropout(dropout)
-        mask = self._get_attn_subsequent_mask(MAX_SIZE)
-        # Register self.mask as a buffer in TransformerDecoderLayer, so
-        # it gets TransformerDecoderLayer's cuda behavior automatically.
-        self.register_buffer("mask", mask)
-
-    def forward(
-        self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
-    ):
-        """
-        Args:
-            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
-            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
-            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
-            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`
-
-        Returns:
-            (`FloatTensor`, `FloatTensor`, `FloatTensor`):
-
-            * output `[batch_size x 1 x model_dim]`
-            * attn `[batch_size x 1 x src_len]`
-            * all_input `[batch_size x current_step x model_dim]`
-
-        """
-        dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0)
-        input_norm = self.layer_norm_1(inputs)
-        all_input = input_norm
-        if previous_input is not None:
-            all_input = torch.cat((previous_input, input_norm), dim=1)
-            dec_mask = None
-
-        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
-
-        query = self.drop(query) + inputs
-
-        query_norm = self.layer_norm_2(query)
-        mid = self.context_attn(
-            memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
-        )
-        output = self.feed_forward(self.drop(mid) + query)
-
-        return output, all_input
-        # return output
-
-    def _get_attn_subsequent_mask(self, size):
-        """
-        Get an attention mask to avoid using the subsequent info.
-
-        Args:
-            size: int
-
-        Returns:
-            (`LongTensor`):
-
-            * subsequent_mask `[1 x size x size]`
-        """
-        attn_shape = (1, size, size)
-        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")
-        subsequent_mask = torch.from_numpy(subsequent_mask)
-        return subsequent_mask
-
-
-class MultiHeadedAttention(nn.Module):
-    """
-    Multi-Head Attention module from
-    "Attention is All You Need"
-    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
-
-    Similar to standard `dot` attention but uses
-    multiple attention distributions simulataneously
-    to select relevant items.
-
-    .. mermaid::
-
-       graph BT
-          A[key]
-          B[value]
-          C[query]
-          O[output]
-          subgraph Attn
-            D[Attn 1]
-            E[Attn 2]
-            F[Attn N]
-          end
-          A --> D
-          C --> D
-          A --> E
-          C --> E
-          A --> F
-          C --> F
-          D --> O
-          E --> O
-          F --> O
-          B --> O
-
-    Also includes several additional tricks.
-
-    Args:
-       head_count (int): number of parallel heads
-       model_dim (int): the dimension of keys/values/queries,
-           must be divisible by head_count
-       dropout (float): dropout parameter
-    """
-
-    def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):
-        assert model_dim % head_count == 0
-        self.dim_per_head = model_dim // head_count
-        self.model_dim = model_dim
-
-        super().__init__()
-        self.head_count = head_count
-
-        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
-        self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
-        self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)
-        self.softmax = nn.Softmax(dim=-1)
-        self.dropout = nn.Dropout(dropout)
-        self.use_final_linear = use_final_linear
-        if self.use_final_linear:
-            self.final_linear = nn.Linear(model_dim, model_dim)
-
-    def forward(
-        self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
-    ):
-        """
-        Compute the context vector and the attention vectors.
-
-        Args:
-           key (`FloatTensor`): set of `key_len`
-                key vectors `[batch, key_len, dim]`
-           value (`FloatTensor`): set of `key_len`
-                value vectors `[batch, key_len, dim]`
-           query (`FloatTensor`): set of `query_len`
-                 query vectors  `[batch, query_len, dim]`
-           mask: binary mask indicating which keys have
-                 non-zero attention `[batch, query_len, key_len]`
-        Returns:
-           (`FloatTensor`, `FloatTensor`) :
-
-           * output context vectors `[batch, query_len, dim]`
-           * one of the attention vectors `[batch, query_len, key_len]`
-        """
-        batch_size = key.size(0)
-        dim_per_head = self.dim_per_head
-        head_count = self.head_count
-
-        def shape(x):
-            """  projection """
-            return x.view(batch_size, -1, head_count, dim_per_head).transpose(1, 2)
-
-        def unshape(x):
-            """  compute context """
-            return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head)
-
-        # 1) Project key, value, and query.
-        if layer_cache is not None:
-            if type == "self":
-                query, key, value = (
-                    self.linear_query(query),
-                    self.linear_keys(query),
-                    self.linear_values(query),
-                )
-
-                key = shape(key)
-                value = shape(value)
-
-                if layer_cache is not None:
-                    device = key.device
-                    if layer_cache["self_keys"] is not None:
-                        key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
-                    if layer_cache["self_values"] is not None:
-                        value = torch.cat((layer_cache["self_values"].to(device), value), dim=2)
-                    layer_cache["self_keys"] = key
-                    layer_cache["self_values"] = value
-            elif type == "context":
-                query = self.linear_query(query)
-                if layer_cache is not None:
-                    if layer_cache["memory_keys"] is None:
-                        key, value = self.linear_keys(key), self.linear_values(value)
-                        key = shape(key)
-                        value = shape(value)
-                    else:
-                        key, value = (
-                            layer_cache["memory_keys"],
-                            layer_cache["memory_values"],
-                        )
-                    layer_cache["memory_keys"] = key
-                    layer_cache["memory_values"] = value
-                else:
-                    key, value = self.linear_keys(key), self.linear_values(value)
-                    key = shape(key)
-                    value = shape(value)
-        else:
-            key = self.linear_keys(key)
-            value = self.linear_values(value)
-            query = self.linear_query(query)
-            key = shape(key)
-            value = shape(value)
-
-        query = shape(query)
-
-        # 2) Calculate and scale scores.
-        query = query / math.sqrt(dim_per_head)
-        scores = torch.matmul(query, key.transpose(2, 3))
-
-        if mask is not None:
-            mask = mask.unsqueeze(1).expand_as(scores)
-            scores = scores.masked_fill(mask, -1e18)
-
-        # 3) Apply attention dropout and compute context vectors.
-
-        attn = self.softmax(scores)
-
-        if predefined_graph_1 is not None:
-            attn_masked = attn[:, -1] * predefined_graph_1
-            attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
-
-            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)
-
-        drop_attn = self.dropout(attn)
-        if self.use_final_linear:
-            context = unshape(torch.matmul(drop_attn, value))
-            output = self.final_linear(context)
-            return output
-        else:
-            context = torch.matmul(drop_attn, value)
-            return context
-
-
-class DecoderState(object):
-    """Interface for grouping together the current state of a recurrent
-    decoder. In the simplest case just represents the hidden state of
-    the model.  But can also be used for implementing various forms of
-    input_feeding and non-recurrent models.
-
-    Modules need to implement this to utilize beam search decoding.
-    """
-
-    def detach(self):
-        """ Need to document this """
-        self.hidden = tuple([_.detach() for _ in self.hidden])
-        self.input_feed = self.input_feed.detach()
-
-    def beam_update(self, idx, positions, beam_size):
-        """ Need to document this """
-        for e in self._all:
-            sizes = e.size()
-            br = sizes[1]
-            if len(sizes) == 3:
-                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx]
-            else:
-                sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx]
-
-            sent_states.data.copy_(sent_states.data.index_select(1, positions))
-
-    def map_batch_fn(self, fn):
-        raise NotImplementedError()
-
-
-class TransformerDecoderState(DecoderState):
-    """ Transformer Decoder state base class """
-
-    def __init__(self, src):
-        """
-        Args:
-            src (FloatTensor): a sequence of source words tensors
-                    with optional feature tensors, of size (len x batch).
-        """
-        self.src = src
-        self.previous_input = None
-        self.previous_layer_inputs = None
-        self.cache = None
-
-    @property
-    def _all(self):
-        """
-        Contains attributes that need to be updated in self.beam_update().
-        """
-        if self.previous_input is not None and self.previous_layer_inputs is not None:
-            return (self.previous_input, self.previous_layer_inputs, self.src)
-        else:
-            return (self.src,)
-
-    def detach(self):
-        if self.previous_input is not None:
-            self.previous_input = self.previous_input.detach()
-        if self.previous_layer_inputs is not None:
-            self.previous_layer_inputs = self.previous_layer_inputs.detach()
-        self.src = self.src.detach()
-
-    def update_state(self, new_input, previous_layer_inputs):
-        state = TransformerDecoderState(self.src)
-        state.previous_input = new_input
-        state.previous_layer_inputs = previous_layer_inputs
-        return state
-
-    def _init_cache(self, memory_bank, num_layers):
-        self.cache = {}
-
-        for l in range(num_layers):
-            layer_cache = {"memory_keys": None, "memory_values": None}
-            layer_cache["self_keys"] = None
-            layer_cache["self_values"] = None
-            self.cache["layer_{}".format(l)] = layer_cache
-
-    def repeat_beam_size_times(self, beam_size):
-        """ Repeat beam_size times along batch dimension. """
-        self.src = self.src.data.repeat(1, beam_size, 1)
-
-    def map_batch_fn(self, fn):
-        def _recursive_map(struct, batch_dim=0):
-            for k, v in struct.items():
-                if v is not None:
-                    if isinstance(v, dict):
-                        _recursive_map(v)
-                    else:
-                        struct[k] = fn(v, batch_dim)
-
-        self.src = fn(self.src, 0)
-        if self.cache is not None:
-            _recursive_map(self.cache)
-
-
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-class PositionwiseFeedForward(nn.Module):
-    """ A two-layer Feed-Forward-Network with residual layer norm.
-
-    Args:
-        d_model (int): the size of input for the first-layer of the FFN.
-        d_ff (int): the hidden layer size of the second-layer
-            of the FNN.
-        dropout (float): dropout probability in :math:`[0, 1)`.
-    """
-
-    def __init__(self, d_model, d_ff, dropout=0.1):
-        super().__init__()
-        self.w_1 = nn.Linear(d_model, d_ff)
-        self.w_2 = nn.Linear(d_ff, d_model)
-        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
-        self.actv = gelu
-        self.dropout_1 = nn.Dropout(dropout)
-        self.dropout_2 = nn.Dropout(dropout)
-
-    def forward(self, x):
-        inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x))))
-        output = self.dropout_2(self.w_2(inter))
-        return output + x
-
-
-#
-# TRANSLATOR
-# The following code is used to generate summaries using the
-# pre-trained weights and beam search.
-#
-
-
-def build_predictor(args, tokenizer, symbols, model, logger=None):
-    # we should be able to refactor the global scorer a lot
-    scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
-    translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger)
-    return translator
-
-
-class GNMTGlobalScorer(object):
-    """
-    NMT re-ranking score from
-    "Google's Neural Machine Translation System" :cite:`wu2016google`
-
-    Args:
-       alpha (float): length parameter
-       beta (float):  coverage parameter
-    """
-
-    def __init__(self, alpha, length_penalty):
-        self.alpha = alpha
-        penalty_builder = PenaltyBuilder(length_penalty)
-        self.length_penalty = penalty_builder.length_penalty()
-
-    def score(self, beam, logprobs):
-        """
-        Rescores a prediction based on penalty functions
-        """
-        normalized_probs = self.length_penalty(beam, logprobs, self.alpha)
-        return normalized_probs
-
-
-class PenaltyBuilder(object):
-    """
-    Returns the Length and Coverage Penalty function for Beam Search.
-
-    Args:
-        length_pen (str): option name of length pen
-        cov_pen (str): option name of cov pen
-    """
-
-    def __init__(self, length_pen):
-        self.length_pen = length_pen
-
-    def length_penalty(self):
-        if self.length_pen == "wu":
-            return self.length_wu
-        elif self.length_pen == "avg":
-            return self.length_average
-        else:
-            return self.length_none
-
-    """
-    Below are all the different penalty terms implemented so far
-    """
-
-    def length_wu(self, beam, logprobs, alpha=0.0):
-        """
-        NMT length re-ranking score from
-        "Google's Neural Machine Translation System" :cite:`wu2016google`.
-        """
-
-        modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)
-        return logprobs / modifier
-
-    def length_average(self, beam, logprobs, alpha=0.0):
-        """
-        Returns the average probability of tokens in a sequence.
-        """
-        return logprobs / len(beam.next_ys)
-
-    def length_none(self, beam, logprobs, alpha=0.0, beta=0.0):
-        """
-        Returns unmodified scores.
-        """
-        return logprobs
-
-
-class Translator(object):
-    """
-    Uses a model to translate a batch of sentences.
-
-    Args:
-       model (:obj:`onmt.modules.NMTModel`):
-          NMT model to use for translation
-       fields (dict of Fields): data fields
-       beam_size (int): size of beam to use
-       n_best (int): number of translations produced
-       max_length (int): maximum length output to produce
-       global_scores (:obj:`GlobalScorer`):
-         object to rescore final translations
-       copy_attn (bool): use copy attention during translation
-       beam_trace (bool): trace beam search for debugging
-       logger(logging.Logger): logger.
-    """
-
-    def __init__(self, args, model, vocab, symbols, global_scorer=None, logger=None):
-        self.logger = logger
-
-        self.args = args
-        self.model = model
-        self.generator = self.model.generator
-        self.vocab = vocab
-        self.symbols = symbols
-        self.start_token = symbols["BOS"]
-        self.end_token = symbols["EOS"]
-
-        self.global_scorer = global_scorer
-        self.beam_size = args.beam_size
-        self.min_length = args.min_length
-        self.max_length = args.max_length
-
-    def translate(self, batch, step, attn_debug=False):
-        """ Generates summaries from one batch of data.
-        """
-        self.model.eval()
-        with torch.no_grad():
-            batch_data = self.translate_batch(batch)
-            translations = self.from_batch(batch_data)
-        return translations
-
-    def translate_batch(self, batch, fast=False):
-        """
-        Translate a batch of sentences.
-
-        Mostly a wrapper around :obj:`Beam`.
-
-        Args:
-           batch (:obj:`Batch`): a batch from a dataset object
-           data (:obj:`Dataset`): the dataset object
-           fast (bool): enables fast beam search (may not support all features)
-
-        Todo:
-           Shouldn't need the original dataset.
-        """
-        with torch.no_grad():
-            return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
-
-    # Where the beam search lives
-    # I have no idea why it is being called from the method above
-    def _fast_translate_batch(self, batch, max_length, min_length=0):
-        """ Beam Search using the encoder inputs contained in `batch`.
-        """
-
-        # The batch object is funny
-        # Instead of just looking at the size of the arguments we encapsulate
-        # a size argument.
-        # Where is it defined?
-        beam_size = self.beam_size
-        batch_size = batch.batch_size
-        src = batch.src
-        segs = batch.segs
-        mask_src = batch.mask_src
-
-        src_features = self.model.bert(src, segs, mask_src)
-        dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True)
-        device = src_features.device
-
-        # Tile states and memory beam_size times.
-        dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
-        src_features = tile(src_features, beam_size, dim=0)
-        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
-        beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device)
-        alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device)
-
-        # Give full probability to the first beam on the first step.
-        topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size)
-
-        # Structure that holds finished hypotheses.
-        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
-
-        results = {}
-        results["predictions"] = [[] for _ in range(batch_size)]  # noqa: F812
-        results["scores"] = [[] for _ in range(batch_size)]  # noqa: F812
-        results["gold_score"] = [0] * batch_size
-        results["batch"] = batch
-
-        for step in range(max_length):
-            decoder_input = alive_seq[:, -1].view(1, -1)
-
-            # Decoder forward.
-            decoder_input = decoder_input.transpose(0, 1)
-
-            dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step)
-
-            # Generator forward.
-            log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
-            vocab_size = log_probs.size(-1)
-
-            if step < min_length:
-                log_probs[:, self.end_token] = -1e20
-
-            # Multiply probs by the beam probability.
-            log_probs += topk_log_probs.view(-1).unsqueeze(1)
-
-            alpha = self.global_scorer.alpha
-            length_penalty = ((5.0 + (step + 1)) / 6.0) ** alpha
-
-            # Flatten probs into a list of possibilities.
-            curr_scores = log_probs / length_penalty
-
-            if self.args.block_trigram:
-                cur_len = alive_seq.size(1)
-                if cur_len > 3:
-                    for i in range(alive_seq.size(0)):
-                        fail = False
-                        words = [int(w) for w in alive_seq[i]]
-                        words = [self.vocab.ids_to_tokens[w] for w in words]
-                        words = " ".join(words).replace(" ##", "").split()
-                        if len(words) <= 3:
-                            continue
-                        trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)]
-                        trigram = tuple(trigrams[-1])
-                        if trigram in trigrams[:-1]:
-                            fail = True
-                        if fail:
-                            curr_scores[i] = -10e20
-
-            curr_scores = curr_scores.reshape(-1, beam_size * vocab_size)
-            topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1)
-
-            # Recover log probs.
-            topk_log_probs = topk_scores * length_penalty
-
-            # Resolve beam origin and true word ids.
-            topk_beam_index = topk_ids.div(vocab_size)
-            topk_ids = topk_ids.fmod(vocab_size)
-
-            # Map beam_index to batch_index in the flat representation.
-            batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1)
-            select_indices = batch_index.view(-1)
-
-            # Append last prediction.
-            alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1)
-
-            is_finished = topk_ids.eq(self.end_token)
-            if step + 1 == max_length:
-                is_finished.fill_(1)
-            # End condition is top beam is finished.
-            end_condition = is_finished[:, 0].eq(1)
-            # Save finished hypotheses.
-            if is_finished.any():
-                predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1))
-                for i in range(is_finished.size(0)):
-                    b = batch_offset[i]
-                    if end_condition[i]:
-                        is_finished[i].fill_(1)
-                    finished_hyp = is_finished[i].nonzero().view(-1)
-                    # Store finished hypotheses for this batch.
-                    for j in finished_hyp:
-                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, 1:]))
-                    # If the batch reached the end, save the n_best hypotheses.
-                    if end_condition[i]:
-                        best_hyp = sorted(hypotheses[b], key=lambda x: x[0], reverse=True)
-                        score, pred = best_hyp[0]
-
-                        results["scores"][b].append(score)
-                        results["predictions"][b].append(pred)
-                non_finished = end_condition.eq(0).nonzero().view(-1)
-                # If all sentences are translated, no need to go further.
-                if len(non_finished) == 0:
-                    break
-                # Remove finished batches for the next step.
-                topk_log_probs = topk_log_probs.index_select(0, non_finished)
-                batch_index = batch_index.index_select(0, non_finished)
-                batch_offset = batch_offset.index_select(0, non_finished)
-                alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1))
-            # Reorder states.
-            select_indices = batch_index.view(-1)
-            src_features = src_features.index_select(0, select_indices)
-            dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices))
-
-        return results
-
-    def from_batch(self, translation_batch):
-        batch = translation_batch["batch"]
-        assert len(translation_batch["gold_score"]) == len(translation_batch["predictions"])
-        batch_size = batch.batch_size
-
-        preds, _, _, tgt_str, src = (
-            translation_batch["predictions"],
-            translation_batch["scores"],
-            translation_batch["gold_score"],
-            batch.tgt_str,
-            batch.src,
-        )
-
-        translations = []
-        for b in range(batch_size):
-            pred_sents = self.vocab.convert_ids_to_tokens([int(n) for n in preds[b][0]])
-            pred_sents = " ".join(pred_sents).replace(" ##", "")
-            gold_sent = " ".join(tgt_str[b].split())
-            raw_src = [self.vocab.ids_to_tokens[int(t)] for t in src[b]][:500]
-            raw_src = " ".join(raw_src)
-            translation = (pred_sents, gold_sent, raw_src)
-            translations.append(translation)
-
-        return translations
-
-
-def tile(x, count, dim=0):
-    """
-    Tiles x on dimension dim count times.
-    """
-    perm = list(range(len(x.size())))
-    if dim != 0:
-        perm[0], perm[dim] = perm[dim], perm[0]
-        x = x.permute(perm).contiguous()
-    out_size = list(x.size())
-    out_size[0] *= count
-    batch = x.size(0)
-    x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size)
-    if dim != 0:
-        x = x.permute(perm).contiguous()
-    return x
-
-
-#
-# Optimizer for training. We keep this here in case we want to add
-# a finetuning script.
-#
-
-
-class BertSumOptimizer(object):
-    """ Specific optimizer for BertSum.
-
-    As described in [1], the authors fine-tune BertSum for abstractive
-    summarization using two Adam Optimizers with different warm-up steps and
-    learning rate. They also use a custom learning rate scheduler.
-
-    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
-        arXiv preprint arXiv:1908.08345 (2019).
-    """
-
-    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
-        self.encoder = model.encoder
-        self.decoder = model.decoder
-        self.lr = lr
-        self.warmup_steps = warmup_steps
-
-        self.optimizers = {
-            "encoder": torch.optim.Adam(
-                model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
-            ),
-            "decoder": torch.optim.Adam(
-                model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
-            ),
-        }
-
-        self._step = 0
-        self.current_learning_rates = {}
-
-    def _update_rate(self, stack):
-        return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5))
-
-    def zero_grad(self):
-        self.optimizer_decoder.zero_grad()
-        self.optimizer_encoder.zero_grad()
-
-    def step(self):
-        self._step += 1
-        for stack, optimizer in self.optimizers.items():
-            new_rate = self._update_rate(stack)
-            for param_group in optimizer.param_groups:
-                param_group["lr"] = new_rate
-            optimizer.step()
-            self.current_learning_rates[stack] = new_rate
diff --git a/server/transformers/examples/summarization/requirements.txt b/server/transformers/examples/summarization/requirements.txt
deleted file mode 100644
index f984af489cfc4f7210524cd4efce58766404e04c..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-transformers
-
-# For ROUGE
-nltk
-py-rouge
diff --git a/server/transformers/examples/summarization/run_summarization.py b/server/transformers/examples/summarization/run_summarization.py
deleted file mode 100644
index 4afa97b5a963a909d9f1465dbd5f96e1f23c7987..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/run_summarization.py
+++ /dev/null
@@ -1,323 +0,0 @@
-#! /usr/bin/python3
-import argparse
-import logging
-import os
-import sys
-from collections import namedtuple
-
-import torch
-from torch.utils.data import DataLoader, SequentialSampler
-from tqdm import tqdm
-
-from modeling_bertabs import BertAbs, build_predictor
-from transformers import BertTokenizer
-from utils_summarization import (
-    SummarizationDataset,
-    build_mask,
-    compute_token_type_ids,
-    encode_for_summarization,
-    fit_to_block_size,
-)
-
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])
-
-
-def evaluate(args):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
-    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
-    model.to(args.device)
-    model.eval()
-
-    symbols = {
-        "BOS": tokenizer.vocab["[unused0]"],
-        "EOS": tokenizer.vocab["[unused1]"],
-        "PAD": tokenizer.vocab["[PAD]"],
-    }
-
-    if args.compute_rouge:
-        reference_summaries = []
-        generated_summaries = []
-
-        import rouge
-        import nltk
-
-        nltk.download("punkt")
-        rouge_evaluator = rouge.Rouge(
-            metrics=["rouge-n", "rouge-l"],
-            max_n=2,
-            limit_length=True,
-            length_limit=args.beam_size,
-            length_limit_type="words",
-            apply_avg=True,
-            apply_best=False,
-            alpha=0.5,  # Default F1_score
-            weight_factor=1.2,
-            stemming=True,
-        )
-
-    # these (unused) arguments are defined to keep the compatibility
-    # with the legacy code and will be deleted in a next iteration.
-    args.result_path = ""
-    args.temp_dir = ""
-
-    data_iterator = build_data_iterator(args, tokenizer)
-    predictor = build_predictor(args, tokenizer, symbols, model)
-
-    logger.info("***** Running evaluation *****")
-    logger.info("  Number examples = %d", len(data_iterator.dataset))
-    logger.info("  Batch size = %d", args.batch_size)
-    logger.info("")
-    logger.info("***** Beam Search parameters *****")
-    logger.info("  Beam size = %d", args.beam_size)
-    logger.info("  Minimum length = %d", args.min_length)
-    logger.info("  Maximum length = %d", args.max_length)
-    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
-    logger.info("  Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))
-
-    for batch in tqdm(data_iterator):
-        batch_data = predictor.translate_batch(batch)
-        translations = predictor.from_batch(batch_data)
-        summaries = [format_summary(t) for t in translations]
-        save_summaries(summaries, args.summaries_output_dir, batch.document_names)
-
-        if args.compute_rouge:
-            reference_summaries += batch.tgt_str
-            generated_summaries += summaries
-
-    if args.compute_rouge:
-        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
-        str_scores = format_rouge_scores(scores)
-        save_rouge_scores(str_scores)
-        print(str_scores)
-
-
-def save_summaries(summaries, path, original_document_name):
-    """ Write the summaries in fies that are prefixed by the original
-    files' name with the `_summary` appended.
-
-    Attributes:
-        original_document_names: List[string]
-            Name of the document that was summarized.
-        path: string
-            Path were the summaries will be written
-        summaries: List[string]
-            The summaries that we produced.
-    """
-    for summary, document_name in zip(summaries, original_document_name):
-        # Prepare the summary file's name
-        if "." in document_name:
-            bare_document_name = ".".join(document_name.split(".")[:-1])
-            extension = document_name.split(".")[-1]
-            name = bare_document_name + "_summary." + extension
-        else:
-            name = document_name + "_summary"
-
-        file_path = os.path.join(path, name)
-        with open(file_path, "w") as output:
-            output.write(summary)
-
-
-def format_summary(translation):
-    """ Transforms the output of the `from_batch` function
-    into nicely formatted summaries.
-    """
-    raw_summary, _, _ = translation
-    summary = (
-        raw_summary.replace("[unused0]", "")
-        .replace("[unused3]", "")
-        .replace("[PAD]", "")
-        .replace("[unused1]", "")
-        .replace(r" +", " ")
-        .replace(" [unused2] ", ". ")
-        .replace("[unused2]", "")
-        .strip()
-    )
-
-    return summary
-
-
-def format_rouge_scores(scores):
-    return """\n
-****** ROUGE SCORES ******
-
-** ROUGE 1
-F1        >> {:.3f}
-Precision >> {:.3f}
-Recall    >> {:.3f}
-
-** ROUGE 2
-F1        >> {:.3f}
-Precision >> {:.3f}
-Recall    >> {:.3f}
-
-** ROUGE L
-F1        >> {:.3f}
-Precision >> {:.3f}
-Recall    >> {:.3f}""".format(
-        scores["rouge-1"]["f"],
-        scores["rouge-1"]["p"],
-        scores["rouge-1"]["r"],
-        scores["rouge-2"]["f"],
-        scores["rouge-2"]["p"],
-        scores["rouge-2"]["r"],
-        scores["rouge-l"]["f"],
-        scores["rouge-l"]["p"],
-        scores["rouge-l"]["r"],
-    )
-
-
-def save_rouge_scores(str_scores):
-    with open("rouge_scores.txt", "w") as output:
-        output.write(str_scores)
-
-
-#
-# LOAD the dataset
-#
-
-
-def build_data_iterator(args, tokenizer):
-    dataset = load_and_cache_examples(args, tokenizer)
-    sampler = SequentialSampler(dataset)
-
-    def collate_fn(data):
-        return collate(data, tokenizer, block_size=512, device=args.device)
-
-    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
-
-    return iterator
-
-
-def load_and_cache_examples(args, tokenizer):
-    dataset = SummarizationDataset(args.documents_dir)
-    return dataset
-
-
-def collate(data, tokenizer, block_size, device):
-    """ Collate formats the data passed to the data loader.
-
-    In particular we tokenize the data batch after batch to avoid keeping them
-    all in memory. We output the data as a namedtuple to fit the original BertAbs's
-    API.
-    """
-    data = [x for x in data if not len(x[1]) == 0]  # remove empty_files
-    names = [name for name, _, _ in data]
-    summaries = [" ".join(summary_list) for _, _, summary_list in data]
-
-    encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
-    encoded_stories = torch.tensor(
-        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
-    )
-    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
-    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
-
-    batch = Batch(
-        document_names=names,
-        batch_size=len(encoded_stories),
-        src=encoded_stories.to(device),
-        segs=encoder_token_type_ids.to(device),
-        mask_src=encoder_mask.to(device),
-        tgt_str=summaries,
-    )
-
-    return batch
-
-
-def decode_summary(summary_tokens, tokenizer):
-    """ Decode the summary and return it in a format
-    suitable for evaluation.
-    """
-    summary_tokens = summary_tokens.to("cpu").numpy()
-    summary = tokenizer.decode(summary_tokens)
-    sentences = summary.split(".")
-    sentences = [s + "." for s in sentences]
-    return sentences
-
-
-def main():
-    """ The main function defines the interface with the users.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--documents_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The folder where the documents to summarize are located.",
-    )
-    parser.add_argument(
-        "--summaries_output_dir",
-        default=None,
-        type=str,
-        required=False,
-        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
-    )
-    parser.add_argument(
-        "--compute_rouge",
-        default=False,
-        type=bool,
-        required=False,
-        help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
-    )
-    # EVALUATION options
-    parser.add_argument(
-        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
-    )
-    parser.add_argument(
-        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
-    )
-    # BEAM SEARCH arguments
-    parser.add_argument(
-        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
-    )
-    parser.add_argument(
-        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
-    )
-    parser.add_argument(
-        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
-    )
-    parser.add_argument(
-        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
-    )
-    parser.add_argument(
-        "--block_trigram",
-        default=True,
-        type=bool,
-        help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
-    )
-    args = parser.parse_args()
-
-    # Select device (distibuted not available)
-    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-
-    # Check the existence of directories
-    if not args.summaries_output_dir:
-        args.summaries_output_dir = args.documents_dir
-
-    if not documents_dir_is_valid(args.documents_dir):
-        raise FileNotFoundError(
-            "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
-        )
-    os.makedirs(args.summaries_output_dir, exist_ok=True)
-
-    evaluate(args)
-
-
-def documents_dir_is_valid(path):
-    if not os.path.exists(path):
-        return False
-
-    file_list = os.listdir(path)
-    if len(file_list) == 0:
-        return False
-
-    return True
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/examples/summarization/test_utils_summarization.py b/server/transformers/examples/summarization/test_utils_summarization.py
deleted file mode 100644
index d562ad04b7be01be4dbc54d71fcbf019ed6929e1..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/test_utils_summarization.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-import numpy as np
-import torch
-
-from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
-
-
-class SummarizationDataProcessingTest(unittest.TestCase):
-    def setUp(self):
-        self.block_size = 10
-
-    def test_fit_to_block_sequence_too_small(self):
-        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
-        sequence = [1, 2, 3, 4]
-        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
-
-    def test_fit_to_block_sequence_fit_exactly(self):
-        """ Do nothing if the sequence is the right size. """
-        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
-
-    def test_fit_to_block_sequence_too_big(self):
-        """ Truncate the sequence if it is too long. """
-        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
-        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
-
-    def test_process_story_no_highlights(self):
-        """ Processing a story with no highlights returns an empty list for the summary.
-        """
-        raw_story = """It was the year of Our Lord one thousand seven hundred and
-        seventy-five.\n\nSpiritual revelations were conceded to England at that
-        favoured period, as at this."""
-        _, summary_lines = process_story(raw_story)
-        self.assertEqual(summary_lines, [])
-
-    def test_process_empty_story(self):
-        """ An empty story returns an empty collection of lines.
-        """
-        raw_story = ""
-        story_lines, summary_lines = process_story(raw_story)
-        self.assertEqual(story_lines, [])
-        self.assertEqual(summary_lines, [])
-
-    def test_process_story_with_missing_period(self):
-        raw_story = (
-            "It was the year of Our Lord one thousand seven hundred and "
-            "seventy-five\n\nSpiritual revelations were conceded to England "
-            "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
-        )
-        story_lines, summary_lines = process_story(raw_story)
-
-        expected_story_lines = [
-            "It was the year of Our Lord one thousand seven hundred and seventy-five.",
-            "Spiritual revelations were conceded to England at that favoured period, as at this.",
-        ]
-        self.assertEqual(expected_story_lines, story_lines)
-
-        expected_summary_lines = ["It was the best of times."]
-        self.assertEqual(expected_summary_lines, summary_lines)
-
-    def test_build_mask_no_padding(self):
-        sequence = torch.tensor([1, 2, 3, 4])
-        expected = torch.tensor([1, 1, 1, 1])
-        np.testing.assert_array_equal(build_mask(sequence, 0).numpy(), expected.numpy())
-
-    def test_build_mask(self):
-        sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
-        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
-        np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy())
-
-    def test_build_mask_with_padding_equal_to_one(self):
-        sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
-        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
-        np.testing.assert_array_equal(build_mask(sequence, 1).numpy(), expected.numpy())
-
-    def test_compute_token_type_ids(self):
-        separator = 101
-        batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]])
-        expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]])
-
-        result = compute_token_type_ids(batch, separator)
-        np.testing.assert_array_equal(result, expected)
diff --git a/server/transformers/examples/summarization/utils_summarization.py b/server/transformers/examples/summarization/utils_summarization.py
deleted file mode 100644
index 529eeb3efa05a323d3177ea60e0055a2f29dfbbb..0000000000000000000000000000000000000000
--- a/server/transformers/examples/summarization/utils_summarization.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import os
-from collections import deque
-
-import torch
-from torch.utils.data import Dataset
-
-
-# ------------
-# Data loading
-# ------------
-
-
-class SummarizationDataset(Dataset):
-    """ Abstracts the dataset used to train seq2seq models.
-
-    The class will process the documents that are located in the specified
-    folder. The preprocessing will work on any document that is reasonably
-    formatted. On the CNN/DailyMail dataset it will extract both the story
-    and the summary.
-
-    CNN/Daily News:
-
-    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
-    stored in different files; the summary appears at the end of the story as
-    sentences that are prefixed by the special `@highlight` line. To process
-    the data, untar both datasets in the same folder, and pass the path to this
-    folder as the "data_dir argument. The formatting code was inspired by [2].
-
-    [1] https://cs.nyu.edu/~kcho/
-    [2] https://github.com/abisee/cnn-dailymail/
-    """
-
-    def __init__(self, path="", prefix="train"):
-        """ We initialize the class by listing all the documents to summarize.
-        Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
-        """
-        assert os.path.isdir(path)
-
-        self.documents = []
-        story_filenames_list = os.listdir(path)
-        for story_filename in story_filenames_list:
-            if "summary" in story_filename:
-                continue
-            path_to_story = os.path.join(path, story_filename)
-            if not os.path.isfile(path_to_story):
-                continue
-            self.documents.append(path_to_story)
-
-    def __len__(self):
-        """ Returns the number of documents. """
-        return len(self.documents)
-
-    def __getitem__(self, idx):
-        document_path = self.documents[idx]
-        document_name = document_path.split("/")[-1]
-        with open(document_path, encoding="utf-8") as source:
-            raw_story = source.read()
-            story_lines, summary_lines = process_story(raw_story)
-        return document_name, story_lines, summary_lines
-
-
-def process_story(raw_story):
-    """ Extract the story and summary from a story file.
-
-    Attributes:
-        raw_story (str): content of the story file as an utf-8 encoded string.
-
-    Raises:
-        IndexError: If the stoy is empty or contains no highlights.
-    """
-    nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
-
-    # for some unknown reason some lines miss a period, add it
-    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
-
-    # gather article lines
-    story_lines = []
-    lines = deque(nonempty_lines)
-    while True:
-        try:
-            element = lines.popleft()
-            if element.startswith("@highlight"):
-                break
-            story_lines.append(element)
-        except IndexError:
-            # if "@highlight" is absent from the file we pop
-            # all elements until there is None, raising an exception.
-            return story_lines, []
-
-    # gather summary lines
-    summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
-
-    return story_lines, summary_lines
-
-
-def _add_missing_period(line):
-    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u2019", ")"]
-    if line.startswith("@highlight"):
-        return line
-    if line[-1] in END_TOKENS:
-        return line
-    return line + "."
-
-
-# --------------------------
-# Encoding and preprocessing
-# --------------------------
-
-
-def fit_to_block_size(sequence, block_size, pad_token_id):
-    """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter we append padding token to the right of the sequence.
-    """
-    if len(sequence) > block_size:
-        return sequence[:block_size]
-    else:
-        sequence.extend([pad_token_id] * (block_size - len(sequence)))
-        return sequence
-
-
-def build_mask(sequence, pad_token_id):
-    """ Builds the mask. The attention mechanism will only attend to positions
-    with value 1. """
-    mask = torch.ones_like(sequence)
-    idx_pad_tokens = sequence == pad_token_id
-    mask[idx_pad_tokens] = 0
-    return mask
-
-
-def encode_for_summarization(story_lines, summary_lines, tokenizer):
-    """ Encode the story and summary lines, and join them
-    as specified in [1] by using `[SEP] [CLS]` tokens to separate
-    sentences.
-    """
-    story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
-    story_token_ids = [token for sentence in story_lines_token_ids for token in sentence]
-    summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
-    summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence]
-
-    return story_token_ids, summary_token_ids
-
-
-def compute_token_type_ids(batch, separator_token_id):
-    """ Segment embeddings as described in [1]
-
-    The values {0,1} were found in the repository [2].
-
-    Attributes:
-        batch: torch.Tensor, size [batch_size, block_size]
-            Batch of input.
-        separator_token_id: int
-            The value of the token that separates the segments.
-
-    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
-        arXiv preprint arXiv:1908.08345 (2019).
-    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
-    """
-    batch_embeddings = []
-    for sequence in batch:
-        sentence_num = -1
-        embeddings = []
-        for s in sequence:
-            if s == separator_token_id:
-                sentence_num += 1
-            embeddings.append(sentence_num % 2)
-        batch_embeddings.append(embeddings)
-    return torch.tensor(batch_embeddings)
diff --git a/server/transformers/examples/test_examples.py b/server/transformers/examples/test_examples.py
deleted file mode 100644
index a31c243dd84a0c5eebb7e6e9f39a77a342fb2ccc..0000000000000000000000000000000000000000
--- a/server/transformers/examples/test_examples.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-import sys
-import unittest
-from unittest.mock import patch
-
-import run_generation
-import run_glue
-import run_squad
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-
-
-def get_setup_file():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f")
-    args = parser.parse_args()
-    return args.f
-
-
-class ExamplesTests(unittest.TestCase):
-    def test_run_glue(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        testargs = [
-            "run_glue.py",
-            "--data_dir=./examples/tests_samples/MRPC/",
-            "--task_name=mrpc",
-            "--do_train",
-            "--do_eval",
-            "--output_dir=./examples/tests_samples/temp_dir",
-            "--per_gpu_train_batch_size=2",
-            "--per_gpu_eval_batch_size=1",
-            "--learning_rate=1e-4",
-            "--max_steps=10",
-            "--warmup_steps=2",
-            "--overwrite_output_dir",
-            "--seed=42",
-        ]
-        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, "argv", testargs + [model_type, model_name]):
-            result = run_glue.main()
-            for value in result.values():
-                self.assertGreaterEqual(value, 0.75)
-
-    def test_run_squad(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        testargs = [
-            "run_squad.py",
-            "--data_dir=./examples/tests_samples/SQUAD",
-            "--model_name=bert-base-uncased",
-            "--output_dir=./examples/tests_samples/temp_dir",
-            "--max_steps=10",
-            "--warmup_steps=2",
-            "--do_train",
-            "--do_eval",
-            "--version_2_with_negative",
-            "--learning_rate=2e-4",
-            "--per_gpu_train_batch_size=2",
-            "--per_gpu_eval_batch_size=1",
-            "--overwrite_output_dir",
-            "--seed=42",
-        ]
-        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, "argv", testargs + [model_type, model_name]):
-            result = run_squad.main()
-            self.assertGreaterEqual(result["f1"], 30)
-            self.assertGreaterEqual(result["exact"], 30)
-
-    def test_generation(self):
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-
-        testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"]
-        model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt")
-        with patch.object(sys, "argv", testargs + [model_type, model_name]):
-            result = run_generation.main()
-            self.assertGreaterEqual(len(result), 10)
diff --git a/server/transformers/examples/tests_samples/.gitignore b/server/transformers/examples/tests_samples/.gitignore
deleted file mode 100644
index c8ce21fe2411c3dc3022e26ccf4e11cc6b58a01d..0000000000000000000000000000000000000000
--- a/server/transformers/examples/tests_samples/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-*.*
-cache*
-temp*
-!*.tsv
-!*.json
-!.gitignore
\ No newline at end of file
diff --git a/server/transformers/examples/tests_samples/MRPC/dev.tsv b/server/transformers/examples/tests_samples/MRPC/dev.tsv
deleted file mode 100644
index 5b814856c63f44ef8c082726ae19285a4faec26c..0000000000000000000000000000000000000000
--- a/server/transformers/examples/tests_samples/MRPC/dev.tsv
+++ /dev/null
@@ -1,7 +0,0 @@
-﻿Quality	#1 ID	#2 ID	#1 String	#2 String
-1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
-0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
-0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
-1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
-0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
-1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/server/transformers/examples/tests_samples/MRPC/train.tsv b/server/transformers/examples/tests_samples/MRPC/train.tsv
deleted file mode 100644
index 5b814856c63f44ef8c082726ae19285a4faec26c..0000000000000000000000000000000000000000
--- a/server/transformers/examples/tests_samples/MRPC/train.tsv
+++ /dev/null
@@ -1,7 +0,0 @@
-﻿Quality	#1 ID	#2 ID	#1 String	#2 String
-1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
-0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
-0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
-1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
-0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
-1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/server/transformers/examples/tests_samples/SQUAD/dev-v2.0.json b/server/transformers/examples/tests_samples/SQUAD/dev-v2.0.json
deleted file mode 100644
index 834d9ee6602b300ea45c67212800b0bbf6d1129e..0000000000000000000000000000000000000000
--- a/server/transformers/examples/tests_samples/SQUAD/dev-v2.0.json
+++ /dev/null
@@ -1,140 +0,0 @@
-{
-    "version": "v2.0",
-    "data": [{
-        "title": "Normans",
-        "paragraphs": [{
-            "qas": [{
-                "question": "In what country is Normandy located?",
-                "id": "56ddde6b9a695914005b9628",
-                "answers": [{
-                    "text": "France",
-                    "answer_start": 159
-                }],
-                "is_impossible": false
-            }, {
-                "question": "When were the Normans in Normandy?",
-                "id": "56ddde6b9a695914005b9629",
-                "answers": [{
-                    "text": "10th and 11th centuries",
-                    "answer_start": 94
-                }],
-                "is_impossible": false
-            }, {
-                "question": "From which countries did the Norse originate?",
-                "id": "56ddde6b9a695914005b962a",
-                "answers": [{
-                    "text": "Denmark, Iceland and Norway",
-                    "answer_start": 256
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "Rollo",
-                    "answer_start": 308
-                }],
-                "question": "Who did King Charles III swear fealty to?",
-                "id": "5ad39d53604f3c001a3fe8d3",
-                "answers": [],
-                "is_impossible": true
-            }, {
-                "plausible_answers": [{
-                    "text": "10th century",
-                    "answer_start": 671
-                }],
-                "question": "When did the Frankish identity emerge?",
-                "id": "5ad39d53604f3c001a3fe8d4",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
-        }, {
-            "qas": [{
-                "question": "Who was the duke in the battle of Hastings?",
-                "id": "56dddf4066d3e219004dad5f",
-                "answers": [{
-                    "text": "William the Conqueror",
-                    "answer_start": 1022
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "Antioch",
-                    "answer_start": 1295
-                }],
-                "question": "What principality did William the conquerer found?",
-                "id": "5ad3a266604f3c001a3fea2b",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
-        }]
-    }, {
-        "title": "Computational_complexity_theory",
-        "paragraphs": [{
-            "qas": [{
-                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
-                "id": "56e16182e3433e1400422e28",
-                "answers": [{
-                    "text": "Computational complexity theory",
-                    "answer_start": 0
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "algorithm",
-                    "answer_start": 472
-                }],
-                "question": "What is a manual application of mathematical steps?",
-                "id": "5ad5316b5b96ef001a10ab76",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
-        }, {
-            "qas": [{
-                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
-                "id": "56e16839cd28a01900c67887",
-                "answers": [{
-                    "text": "if its solution requires significant resources",
-                    "answer_start": 46
-                }],
-                "is_impossible": false
-            }, {
-                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
-                "id": "56e16839cd28a01900c67888",
-                "answers": [{
-                    "text": "mathematical models of computation",
-                    "answer_start": 176
-                }],
-                "is_impossible": false
-            }, {
-                "question": "What are two basic primary resources used to guage complexity?",
-                "id": "56e16839cd28a01900c67889",
-                "answers": [{
-                    "text": "time and storage",
-                    "answer_start": 305
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "the number of gates in a circuit",
-                    "answer_start": 436
-                }],
-                "question": "What unit is measured to determine circuit simplicity?",
-                "id": "5ad532575b96ef001a10ab7f",
-                "answers": [],
-                "is_impossible": true
-            }, {
-                "plausible_answers": [{
-                    "text": "the number of processors",
-                    "answer_start": 502
-                }],
-                "question": "What number is used in perpendicular computing?",
-                "id": "5ad532575b96ef001a10ab80",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
-        }]
-    }]
-}
\ No newline at end of file
diff --git a/server/transformers/examples/tests_samples/SQUAD/train-v2.0.json b/server/transformers/examples/tests_samples/SQUAD/train-v2.0.json
deleted file mode 100644
index 834d9ee6602b300ea45c67212800b0bbf6d1129e..0000000000000000000000000000000000000000
--- a/server/transformers/examples/tests_samples/SQUAD/train-v2.0.json
+++ /dev/null
@@ -1,140 +0,0 @@
-{
-    "version": "v2.0",
-    "data": [{
-        "title": "Normans",
-        "paragraphs": [{
-            "qas": [{
-                "question": "In what country is Normandy located?",
-                "id": "56ddde6b9a695914005b9628",
-                "answers": [{
-                    "text": "France",
-                    "answer_start": 159
-                }],
-                "is_impossible": false
-            }, {
-                "question": "When were the Normans in Normandy?",
-                "id": "56ddde6b9a695914005b9629",
-                "answers": [{
-                    "text": "10th and 11th centuries",
-                    "answer_start": 94
-                }],
-                "is_impossible": false
-            }, {
-                "question": "From which countries did the Norse originate?",
-                "id": "56ddde6b9a695914005b962a",
-                "answers": [{
-                    "text": "Denmark, Iceland and Norway",
-                    "answer_start": 256
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "Rollo",
-                    "answer_start": 308
-                }],
-                "question": "Who did King Charles III swear fealty to?",
-                "id": "5ad39d53604f3c001a3fe8d3",
-                "answers": [],
-                "is_impossible": true
-            }, {
-                "plausible_answers": [{
-                    "text": "10th century",
-                    "answer_start": 671
-                }],
-                "question": "When did the Frankish identity emerge?",
-                "id": "5ad39d53604f3c001a3fe8d4",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
-        }, {
-            "qas": [{
-                "question": "Who was the duke in the battle of Hastings?",
-                "id": "56dddf4066d3e219004dad5f",
-                "answers": [{
-                    "text": "William the Conqueror",
-                    "answer_start": 1022
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "Antioch",
-                    "answer_start": 1295
-                }],
-                "question": "What principality did William the conquerer found?",
-                "id": "5ad3a266604f3c001a3fea2b",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
-        }]
-    }, {
-        "title": "Computational_complexity_theory",
-        "paragraphs": [{
-            "qas": [{
-                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
-                "id": "56e16182e3433e1400422e28",
-                "answers": [{
-                    "text": "Computational complexity theory",
-                    "answer_start": 0
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "algorithm",
-                    "answer_start": 472
-                }],
-                "question": "What is a manual application of mathematical steps?",
-                "id": "5ad5316b5b96ef001a10ab76",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
-        }, {
-            "qas": [{
-                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
-                "id": "56e16839cd28a01900c67887",
-                "answers": [{
-                    "text": "if its solution requires significant resources",
-                    "answer_start": 46
-                }],
-                "is_impossible": false
-            }, {
-                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
-                "id": "56e16839cd28a01900c67888",
-                "answers": [{
-                    "text": "mathematical models of computation",
-                    "answer_start": 176
-                }],
-                "is_impossible": false
-            }, {
-                "question": "What are two basic primary resources used to guage complexity?",
-                "id": "56e16839cd28a01900c67889",
-                "answers": [{
-                    "text": "time and storage",
-                    "answer_start": 305
-                }],
-                "is_impossible": false
-            }, {
-                "plausible_answers": [{
-                    "text": "the number of gates in a circuit",
-                    "answer_start": 436
-                }],
-                "question": "What unit is measured to determine circuit simplicity?",
-                "id": "5ad532575b96ef001a10ab7f",
-                "answers": [],
-                "is_impossible": true
-            }, {
-                "plausible_answers": [{
-                    "text": "the number of processors",
-                    "answer_start": 502
-                }],
-                "question": "What number is used in perpendicular computing?",
-                "id": "5ad532575b96ef001a10ab80",
-                "answers": [],
-                "is_impossible": true
-            }],
-            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
-        }]
-    }]
-}
\ No newline at end of file
diff --git a/server/transformers/examples/utils_multiple_choice.py b/server/transformers/examples/utils_multiple_choice.py
deleted file mode 100644
index 8e19c51414168f91ac6fd5358b6b30048377ed99..0000000000000000000000000000000000000000
--- a/server/transformers/examples/utils_multiple_choice.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
-
-
-import csv
-import glob
-import json
-import logging
-import os
-from typing import List
-
-import tqdm
-
-from transformers import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-    """A single training/test example for multiple choice"""
-
-    def __init__(self, example_id, question, contexts, endings, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            example_id: Unique id for the example.
-            contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-            question: string. The untokenized text of the second sequence (question).
-            endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.example_id = example_id
-        self.question = question
-        self.contexts = contexts
-        self.endings = endings
-        self.label = label
-
-
-class InputFeatures(object):
-    def __init__(self, example_id, choices_features, label):
-        self.example_id = example_id
-        self.choices_features = [
-            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
-            for input_ids, input_mask, segment_ids in choices_features
-        ]
-        self.label = label
-
-
-class DataProcessor(object):
-    """Base class for data converters for multiple choice data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_test_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the test set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-
-class RaceProcessor(DataProcessor):
-    """Processor for the RACE data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
-        high = os.path.join(data_dir, "train/high")
-        middle = os.path.join(data_dir, "train/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
-        high = os.path.join(data_dir, "dev/high")
-        middle = os.path.join(data_dir, "dev/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} test".format(data_dir))
-        high = os.path.join(data_dir, "test/high")
-        middle = os.path.join(data_dir, "test/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_txt(self, input_dir):
-        lines = []
-        files = glob.glob(input_dir + "/*txt")
-        for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, "r", encoding="utf-8") as fin:
-                data_raw = json.load(fin)
-                data_raw["race_id"] = file
-                lines.append(data_raw)
-        return lines
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (_, data_raw) in enumerate(lines):
-            race_id = "%s-%s" % (set_type, data_raw["race_id"])
-            article = data_raw["article"]
-            for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw["answers"][i]) - ord("A"))
-                question = data_raw["questions"][i]
-                options = data_raw["options"][i]
-
-                examples.append(
-                    InputExample(
-                        example_id=race_id,
-                        question=question,
-                        contexts=[article, article, article, article],  # this is not efficient but convenient
-                        endings=[options[0], options[1], options[2], options[3]],
-                        label=truth,
-                    )
-                )
-        return examples
-
-
-class SwagProcessor(DataProcessor):
-    """Processor for the SWAG data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
-        raise ValueError(
-            "For swag testing, the input file does not contain a label column. It can not be tested in current code"
-            "setting!"
-        )
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_csv(self, input_file):
-        with open(input_file, "r", encoding="utf-8") as f:
-            return list(csv.reader(f))
-
-    def _create_examples(self, lines: List[List[str]], type: str):
-        """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != "label":
-            raise ValueError("For training, the input file must contain a label column.")
-
-        examples = [
-            InputExample(
-                example_id=line[2],
-                question=line[5],  # in the swag dataset, the
-                # common beginning of each
-                # choice is stored in "sent2".
-                contexts=[line[4], line[4], line[4], line[4]],
-                endings=[line[7], line[8], line[9], line[10]],
-                label=line[11],
-            )
-            for line in lines[1:]  # we skip the line with the column names
-        ]
-
-        return examples
-
-
-class ArcProcessor(DataProcessor):
-    """Processor for the ARC data set (request from allennlp)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
-        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
-        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
-
-    def get_test_examples(self, data_dir):
-        logger.info("LOOKING AT {} test".format(data_dir))
-        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_json(self, input_file):
-        with open(input_file, "r", encoding="utf-8") as fin:
-            lines = fin.readlines()
-            return lines
-
-    def _create_examples(self, lines, type):
-        """Creates examples for the training and dev sets."""
-
-        # There are two types of labels. They should be normalized
-        def normalize(truth):
-            if truth in "ABCD":
-                return ord(truth) - ord("A")
-            elif truth in "1234":
-                return int(truth) - 1
-            else:
-                logger.info("truth ERROR! %s", str(truth))
-                return None
-
-        examples = []
-        three_choice = 0
-        four_choice = 0
-        five_choice = 0
-        other_choices = 0
-        # we deleted example which has more than or less than four choices
-        for line in tqdm.tqdm(lines, desc="read arc data"):
-            data_raw = json.loads(line.strip("\n"))
-            if len(data_raw["question"]["choices"]) == 3:
-                three_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) == 5:
-                five_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) != 4:
-                other_choices += 1
-                continue
-            four_choice += 1
-            truth = str(normalize(data_raw["answerKey"]))
-            assert truth != "None"
-            question_choices = data_raw["question"]
-            question = question_choices["stem"]
-            id = data_raw["id"]
-            options = question_choices["choices"]
-            if len(options) == 4:
-                examples.append(
-                    InputExample(
-                        example_id=id,
-                        question=question,
-                        contexts=[
-                            options[0]["para"].replace("_", ""),
-                            options[1]["para"].replace("_", ""),
-                            options[2]["para"].replace("_", ""),
-                            options[3]["para"].replace("_", ""),
-                        ],
-                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth,
-                    )
-                )
-
-        if type == "train":
-            assert len(examples) > 1
-            assert examples[0].label is not None
-        logger.info("len examples: %s}", str(len(examples)))
-        logger.info("Three choices: %s", str(three_choice))
-        logger.info("Five choices: %s", str(five_choice))
-        logger.info("Other choices: %s", str(other_choices))
-        logger.info("four choices: %s", str(four_choice))
-
-        return examples
-
-
-def convert_examples_to_features(
-    examples: List[InputExample],
-    label_list: List[str],
-    max_length: int,
-    tokenizer: PreTrainedTokenizer,
-    pad_token_segment_id=0,
-    pad_on_left=False,
-    pad_token=0,
-    mask_padding_with_zero=True,
-) -> List[InputFeatures]:
-    """
-    Loads a data file into a list of `InputFeatures`
-    """
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        choices_features = []
-        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-            text_a = context
-            if example.question.find("_") != -1:
-                # this is for cloze question
-                text_b = example.question.replace("_", ending)
-            else:
-                text_b = example.question + " " + ending
-
-            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
-            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
-                logger.info(
-                    "Attention! you are cropping tokens (swag task is ok). "
-                    "If you are training ARC and RACE and you are poping question + options,"
-                    "you need to try to use a bigger max seq length!"
-                )
-
-            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding_length = max_length - len(input_ids)
-            if pad_on_left:
-                input_ids = ([pad_token] * padding_length) + input_ids
-                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-            else:
-                input_ids = input_ids + ([pad_token] * padding_length)
-                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-            assert len(input_ids) == max_length
-            assert len(attention_mask) == max_length
-            assert len(token_type_ids) == max_length
-            choices_features.append((input_ids, attention_mask, token_type_ids))
-
-        label = label_map[example.label]
-
-        if ex_index < 2:
-            logger.info("*** Example ***")
-            logger.info("race_id: {}".format(example.example_id))
-            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
-                logger.info("label: {}".format(label))
-
-        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
-
-    return features
-
-
-processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}
-
-
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
diff --git a/server/transformers/examples/utils_ner.py b/server/transformers/examples/utils_ner.py
deleted file mode 100644
index 510749c2f59c3e734dd5d07b3b2ad00cf4789849..0000000000000000000000000000000000000000
--- a/server/transformers/examples/utils_ner.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
-
-
-import logging
-import os
-
-
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-    """A single training/test example for token classification."""
-
-    def __init__(self, guid, words, labels):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            words: list. The words of the sequence.
-            labels: (Optional) list. The labels for each word of the sequence. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.words = words
-        self.labels = labels
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_ids = label_ids
-
-
-def read_examples_from_file(data_dir, mode):
-    file_path = os.path.join(data_dir, "{}.txt".format(mode))
-    guid_index = 1
-    examples = []
-    with open(file_path, encoding="utf-8") as f:
-        words = []
-        labels = []
-        for line in f:
-            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
-                    guid_index += 1
-                    words = []
-                    labels = []
-            else:
-                splits = line.split(" ")
-                words.append(splits[0])
-                if len(splits) > 1:
-                    labels.append(splits[-1].replace("\n", ""))
-                else:
-                    # Examples could have no label for mode = "test"
-                    labels.append("O")
-        if words:
-            examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
-    return examples
-
-
-def convert_examples_to_features(
-    examples,
-    label_list,
-    max_seq_length,
-    tokenizer,
-    cls_token_at_end=False,
-    cls_token="[CLS]",
-    cls_token_segment_id=1,
-    sep_token="[SEP]",
-    sep_token_extra=False,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    pad_token_label_id=-100,
-    sequence_a_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
-    """
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d", ex_index, len(examples))
-
-        tokens = []
-        label_ids = []
-        for word, label in zip(example.words, example.labels):
-            word_tokens = tokenizer.tokenize(word)
-            tokens.extend(word_tokens)
-            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
-
-        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-        special_tokens_count = 3 if sep_token_extra else 2
-        if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[: (max_seq_length - special_tokens_count)]
-            label_ids = label_ids[: (max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens += [sep_token]
-        label_ids += [pad_token_label_id]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-            label_ids += [pad_token_label_id]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if cls_token_at_end:
-            tokens += [cls_token]
-            label_ids += [pad_token_label_id]
-            segment_ids += [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            label_ids = [pad_token_label_id] + label_ids
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
-            label_ids = ([pad_token_label_id] * padding_length) + label_ids
-        else:
-            input_ids += [pad_token] * padding_length
-            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
-            segment_ids += [pad_token_segment_id] * padding_length
-            label_ids += [pad_token_label_id] * padding_length
-
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-        assert len(label_ids) == max_seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("guid: %s", example.guid)
-            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
-
-        features.append(
-            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
-        )
-    return features
-
-
-def get_labels(path):
-    if path:
-        with open(path, "r") as f:
-            labels = f.read().splitlines()
-        if "O" not in labels:
-            labels = ["O"] + labels
-        return labels
-    else:
-        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
diff --git a/server/transformers/hubconf.py b/server/transformers/hubconf.py
deleted file mode 100644
index 4e5c1b4b01d3f4b93a58f3f3a66b297b516c1205..0000000000000000000000000000000000000000
--- a/server/transformers/hubconf.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoModelForQuestionAnswering,
-    AutoModelForSequenceClassification,
-    AutoModelWithLMHead,
-    AutoTokenizer,
-)
-from transformers.file_utils import add_start_docstrings
-
-
-dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"]
-
-
-@add_start_docstrings(AutoConfig.__doc__)
-def config(*args, **kwargs):
-    r"""
-                # Using torch.hub !
-                import torch
-
-                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
-                assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
-                assert config.output_attention == True
-                assert unused_kwargs == {'foo': False}
-
-            """
-
-    return AutoConfig.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoTokenizer.__doc__)
-def tokenizer(*args, **kwargs):
-    r"""
-        # Using torch.hub !
-        import torch
-
-        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
-
-    """
-
-    return AutoTokenizer.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModel.__doc__)
-def model(*args, **kwargs):
-    r"""
-            # Using torch.hub !
-            import torch
-
-            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-
-    return AutoModel.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModelWithLMHead.__doc__)
-def modelWithLMHead(*args, **kwargs):
-    r"""
-        # Using torch.hub !
-        import torch
-
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-        assert model.config.output_attention == True
-        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-    """
-    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
-def modelForSequenceClassification(*args, **kwargs):
-    r"""
-            # Using torch.hub !
-            import torch
-
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-
-    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
-def modelForQuestionAnswering(*args, **kwargs):
-    r"""
-        # Using torch.hub !
-        import torch
-
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-        assert model.config.output_attention == True
-        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-    """
-    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
diff --git a/server/transformers/model_cards/KB/albert-base-swedish-cased-alpha/README.md b/server/transformers/model_cards/KB/albert-base-swedish-cased-alpha/README.md
deleted file mode 100644
index aa6ae466a44ad83dc94f902c8f75c33f32eab03d..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/KB/albert-base-swedish-cased-alpha/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Swedish BERT Models
-
-The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on aproximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
-
-The following three models are currently available:
-
-- **bert-base-swedish-cased** (*v1*) - A BERT trained with the same hyperparameters as first published by Google.
-- **bert-base-swedish-cased-ner** (*experimental*) - a BERT fine-tuned for NER using SUC 3.0.
-- **albert-base-swedish-cased-alpha** (*alpha*) - A first attempt at an ALBERT for Swedish.
-
-All models are cased and trained with whole word masking.
-
-## Files
-
-| **name**                        | **files** |
-|---------------------------------|-----------|
-| bert-base-swedish-cased         | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/vocab.txt), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/pytorch_model.bin) |
-| bert-base-swedish-cased-ner     | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/vocab.txt) [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/pytorch_model.bin) |
-| albert-base-swedish-cased-alpha | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/config.json), [sentencepiece model](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/spiece.model), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/pytorch_model.bin) |
-
-TensorFlow model weights will be released soon.
-
-## Usage requirements / installation instructions
-
-The examples below require Huggingface Transformers 2.4.1 and Pytorch 1.3.1 or greater. For Transformers<2.4.0 the tokenizer must be instantiated manually and the `do_lower_case` flag parameter set to `False` and `keep_accents` to `True` (for ALBERT).
-
-To create an environment where the examples can be run, run the following in an terminal on your OS of choice.
-
-```
-# git clone https://github.com/Kungbib/swedish-bert-models
-# cd swedish-bert-models
-# python3 -m venv venv
-# source venv/bin/activate
-# pip install --upgrade pip
-# pip install -r requirements.txt
-```
-
-### BERT Base Swedish
-
-A standard BERT base for Swedish trained on a variety of sources. Vocabulary size is ~50k. Using Huggingface Transformers the model can be loaded in Python as follows:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')
-model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
-```
-
-
-### BERT base fine-tuned for Swedish NER
-
-This model is fine-tuned on the SUC 3.0 dataset. Using the Huggingface pipeline the model can be easily instantiated. For Transformer<2.4.1 it seems the tokenizer must be loaded separately to disable lower-casing of input strings:
-
-```python
-from transformers import pipeline
-
-nlp = pipeline('ner', model='KB/bert-base-swedish-cased-ner', tokenizer='KB/bert-base-swedish-cased-ner')
-
-nlp('Idag släpper KB tre språkmodeller.')
-```
-
-Running the Python code above should produce in something like the result below. Entity types used are `TME` for time, `PRS` for personal names, `LOC` for locations, `EVN` for events and `ORG` for organisations. These labels are subject to change.
-
-```python
-[ { 'word': 'Idag', 'score': 0.9998126029968262, 'entity': 'TME' },
-  { 'word': 'KB',   'score': 0.9814832210540771, 'entity': 'ORG' } ]
-```
-
-The BERT tokenizer often splits words into multiple tokens, with the subparts starting with `##`, for example the string `Engelbert kör Volvo till Herrängens fotbollsklubb` gets tokenized as `Engel ##bert kör Volvo till Herr ##ängens fotbolls ##klubb`. To glue parts back together one can use something like this:
-
-```python
-text = 'Engelbert tar Volvon till Tele2 Arena för att titta på Djurgården IF ' +\
-       'som spelar fotboll i VM klockan två på kvällen.'
-
-l = []
-for token in nlp(text):
-    if token['word'].startswith('##'):
-        l[-1]['word'] += token['word'][2:]
-    else:
-        l += [ token ]
-
-print(l)
-```
-
-Which should result in the following (though less cleanly formated):
-
-```python
-[ { 'word': 'Engelbert',     'score': 0.99..., 'entity': 'PRS'},
-  { 'word': 'Volvon',        'score': 0.99..., 'entity': 'OBJ'},
-  { 'word': 'Tele2',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Arena',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Djurgården',    'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'IF',            'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'VM',            'score': 0.99..., 'entity': 'EVN'},
-  { 'word': 'klockan',       'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'två',           'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'på',            'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'kvällen',       'score': 0.54..., 'entity': 'TME'} ]
-```
-
-### ALBERT base
-
-The easisest way to do this is, again, using Huggingface Transformers:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/albert-base-swedish-cased-alpha'),
-model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')
-```
-
-## Acknowledgements ❤️
-
-- Resources from Stockholms University, Umeå University and Swedish Language Bank at Gothenburg University were used when fine-tuning BERT for NER.
-- Model pretraining was made partly in-house at the KBLab and partly (for material without active copyright) with the support of Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-- Models are hosted on S3 by Huggingface 🤗
-
diff --git a/server/transformers/model_cards/KB/bert-base-swedish-cased-ner/README.md b/server/transformers/model_cards/KB/bert-base-swedish-cased-ner/README.md
deleted file mode 100644
index aa6ae466a44ad83dc94f902c8f75c33f32eab03d..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/KB/bert-base-swedish-cased-ner/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Swedish BERT Models
-
-The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, Swedish Wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
-
-The following three models are currently available:
-
-- **bert-base-swedish-cased** (*v1*) - A BERT trained with the same hyperparameters as first published by Google.
-- **bert-base-swedish-cased-ner** (*experimental*) - a BERT fine-tuned for NER using SUC 3.0.
-- **albert-base-swedish-cased-alpha** (*alpha*) - A first attempt at an ALBERT for Swedish.
-
-All models are cased and trained with whole word masking.
-
-## Files
-
-| **name**                        | **files** |
-|---------------------------------|-----------|
-| bert-base-swedish-cased         | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/vocab.txt), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/pytorch_model.bin) |
-| bert-base-swedish-cased-ner     | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/vocab.txt), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/pytorch_model.bin) |
-| albert-base-swedish-cased-alpha | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/config.json), [sentencepiece model](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/spiece.model), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/pytorch_model.bin) |
-
-TensorFlow model weights will be released soon.
-
-## Usage requirements / installation instructions
-
-The examples below require Huggingface Transformers 2.4.1 and Pytorch 1.3.1 or greater. For Transformers<2.4.0 the tokenizer must be instantiated manually and the `do_lower_case` flag parameter set to `False` and `keep_accents` to `True` (for ALBERT).
-
-To create an environment where the examples can be run, run the following in a terminal on your OS of choice.
-
-```
-# git clone https://github.com/Kungbib/swedish-bert-models
-# cd swedish-bert-models
-# python3 -m venv venv
-# source venv/bin/activate
-# pip install --upgrade pip
-# pip install -r requirements.txt
-```
-
-### BERT Base Swedish
-
-A standard BERT base for Swedish trained on a variety of sources. Vocabulary size is ~50k. Using Huggingface Transformers the model can be loaded in Python as follows:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')
-model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
-```
-
-
-### BERT base fine-tuned for Swedish NER
-
-This model is fine-tuned on the SUC 3.0 dataset. Using the Huggingface pipeline the model can be easily instantiated. For Transformers<2.4.1 it seems the tokenizer must be loaded separately to disable lower-casing of input strings:
-
-```python
-from transformers import pipeline
-
-nlp = pipeline('ner', model='KB/bert-base-swedish-cased-ner', tokenizer='KB/bert-base-swedish-cased-ner')
-
-nlp('Idag släpper KB tre språkmodeller.')
-```
-
-Running the Python code above should produce something like the result below. Entity types used are `TME` for time, `PRS` for personal names, `LOC` for locations, `EVN` for events and `ORG` for organisations. These labels are subject to change.
-
-```python
-[ { 'word': 'Idag', 'score': 0.9998126029968262, 'entity': 'TME' },
-  { 'word': 'KB',   'score': 0.9814832210540771, 'entity': 'ORG' } ]
-```
-
-The BERT tokenizer often splits words into multiple tokens, with the subparts starting with `##`, for example the string `Engelbert kör Volvo till Herrängens fotbollsklubb` gets tokenized as `Engel ##bert kör Volvo till Herr ##ängens fotbolls ##klubb`. To glue parts back together one can use something like this:
-
-```python
-text = 'Engelbert tar Volvon till Tele2 Arena för att titta på Djurgården IF ' +\
-       'som spelar fotboll i VM klockan två på kvällen.'
-
-l = []
-for token in nlp(text):
-    if token['word'].startswith('##'):
-        l[-1]['word'] += token['word'][2:]
-    else:
-        l += [ token ]
-
-print(l)
-```
-
-Which should result in the following (though less cleanly formatted):
-
-```python
-[ { 'word': 'Engelbert',     'score': 0.99..., 'entity': 'PRS'},
-  { 'word': 'Volvon',        'score': 0.99..., 'entity': 'OBJ'},
-  { 'word': 'Tele2',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Arena',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Djurgården',    'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'IF',            'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'VM',            'score': 0.99..., 'entity': 'EVN'},
-  { 'word': 'klockan',       'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'två',           'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'på',            'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'kvällen',       'score': 0.54..., 'entity': 'TME'} ]
-```
-
-### ALBERT base
-
-The easiest way to do this is, again, using Huggingface Transformers:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/albert-base-swedish-cased-alpha')
-model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')
-```
-
-## Acknowledgements ❤️
-
-- Resources from Stockholms University, Umeå University and Swedish Language Bank at Gothenburg University were used when fine-tuning BERT for NER.
-- Model pretraining was made partly in-house at the KBLab and partly (for material without active copyright) with the support of Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-- Models are hosted on S3 by Huggingface 🤗
-
diff --git a/server/transformers/model_cards/KB/bert-base-swedish-cased/README.md b/server/transformers/model_cards/KB/bert-base-swedish-cased/README.md
deleted file mode 100644
index aa6ae466a44ad83dc94f902c8f75c33f32eab03d..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/KB/bert-base-swedish-cased/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Swedish BERT Models
-
-The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, Swedish Wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
-
-The following three models are currently available:
-
-- **bert-base-swedish-cased** (*v1*) - A BERT trained with the same hyperparameters as first published by Google.
-- **bert-base-swedish-cased-ner** (*experimental*) - a BERT fine-tuned for NER using SUC 3.0.
-- **albert-base-swedish-cased-alpha** (*alpha*) - A first attempt at an ALBERT for Swedish.
-
-All models are cased and trained with whole word masking.
-
-## Files
-
-| **name**                        | **files** |
-|---------------------------------|-----------|
-| bert-base-swedish-cased         | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/vocab.txt), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased/pytorch_model.bin) |
-| bert-base-swedish-cased-ner     | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/config.json), [vocab](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/vocab.txt), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/bert-base-swedish-cased-ner/pytorch_model.bin) |
-| albert-base-swedish-cased-alpha | [config](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/config.json), [sentencepiece model](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/spiece.model), [pytorch_model.bin](https://s3.amazonaws.com/models.huggingface.co/bert/KB/albert-base-swedish-cased-alpha/pytorch_model.bin) |
-
-TensorFlow model weights will be released soon.
-
-## Usage requirements / installation instructions
-
-The examples below require Huggingface Transformers 2.4.1 and Pytorch 1.3.1 or greater. For Transformers<2.4.0 the tokenizer must be instantiated manually and the `do_lower_case` flag parameter set to `False` and `keep_accents` to `True` (for ALBERT).
-
-To create an environment where the examples can be run, run the following in a terminal on your OS of choice.
-
-```
-# git clone https://github.com/Kungbib/swedish-bert-models
-# cd swedish-bert-models
-# python3 -m venv venv
-# source venv/bin/activate
-# pip install --upgrade pip
-# pip install -r requirements.txt
-```
-
-### BERT Base Swedish
-
-A standard BERT base for Swedish trained on a variety of sources. Vocabulary size is ~50k. Using Huggingface Transformers the model can be loaded in Python as follows:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')
-model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
-```
-
-
-### BERT base fine-tuned for Swedish NER
-
-This model is fine-tuned on the SUC 3.0 dataset. Using the Huggingface pipeline the model can be easily instantiated. For Transformers<2.4.1 it seems the tokenizer must be loaded separately to disable lower-casing of input strings:
-
-```python
-from transformers import pipeline
-
-nlp = pipeline('ner', model='KB/bert-base-swedish-cased-ner', tokenizer='KB/bert-base-swedish-cased-ner')
-
-nlp('Idag släpper KB tre språkmodeller.')
-```
-
-Running the Python code above should produce something like the result below. Entity types used are `TME` for time, `PRS` for personal names, `LOC` for locations, `EVN` for events and `ORG` for organisations. These labels are subject to change.
-
-```python
-[ { 'word': 'Idag', 'score': 0.9998126029968262, 'entity': 'TME' },
-  { 'word': 'KB',   'score': 0.9814832210540771, 'entity': 'ORG' } ]
-```
-
-The BERT tokenizer often splits words into multiple tokens, with the subparts starting with `##`, for example the string `Engelbert kör Volvo till Herrängens fotbollsklubb` gets tokenized as `Engel ##bert kör Volvo till Herr ##ängens fotbolls ##klubb`. To glue parts back together one can use something like this:
-
-```python
-text = 'Engelbert tar Volvon till Tele2 Arena för att titta på Djurgården IF ' +\
-       'som spelar fotboll i VM klockan två på kvällen.'
-
-l = []
-for token in nlp(text):
-    if token['word'].startswith('##'):
-        l[-1]['word'] += token['word'][2:]
-    else:
-        l += [ token ]
-
-print(l)
-```
-
-Which should result in the following (though less cleanly formatted):
-
-```python
-[ { 'word': 'Engelbert',     'score': 0.99..., 'entity': 'PRS'},
-  { 'word': 'Volvon',        'score': 0.99..., 'entity': 'OBJ'},
-  { 'word': 'Tele2',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Arena',         'score': 0.99..., 'entity': 'LOC'},
-  { 'word': 'Djurgården',    'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'IF',            'score': 0.99..., 'entity': 'ORG'},
-  { 'word': 'VM',            'score': 0.99..., 'entity': 'EVN'},
-  { 'word': 'klockan',       'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'två',           'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'på',            'score': 0.99..., 'entity': 'TME'},
-  { 'word': 'kvällen',       'score': 0.54..., 'entity': 'TME'} ]
-```
-
-### ALBERT base
-
-The easiest way to do this is, again, using Huggingface Transformers:
-
-```python
-from transformers import AutoModel,AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained('KB/albert-base-swedish-cased-alpha')
-model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')
-```
-
-## Acknowledgements ❤️
-
-- Resources from Stockholms University, Umeå University and Swedish Language Bank at Gothenburg University were used when fine-tuning BERT for NER.
-- Model pretraining was made partly in-house at the KBLab and partly (for material without active copyright) with the support of Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-- Models are hosted on S3 by Huggingface 🤗
-
diff --git a/server/transformers/model_cards/Musixmatch/umberto-commoncrawl-cased-v1/README.md b/server/transformers/model_cards/Musixmatch/umberto-commoncrawl-cased-v1/README.md
deleted file mode 100644
index aacd4d9e3cae5df88d4afe929d728cd38885c1c1..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/Musixmatch/umberto-commoncrawl-cased-v1/README.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# UmBERTo Commoncrawl Cased
-
-[UmBERTo](https://github.com/musixmatchresearch/umberto) is a Roberta-based Language Model trained on large Italian Corpora and uses two innovative approaches: SentencePiece and Whole Word Masking. Now available at [github.com/huggingface/transformers](https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1)
-
-<p align="center">
-    <img src="https://user-images.githubusercontent.com/7140210/72913702-d55a8480-3d3d-11ea-99fc-f2ef29af4e72.jpg" width="700"> </br>
-    Marco Lodola, Monument to Umberto Eco, Alessandria 2019
-</p>
-
-## Dataset
-UmBERTo-Commoncrawl-Cased utilizes the Italian subcorpus of [OSCAR](https://traces1.inria.fr/oscar/) as the training set of the language model. We used the deduplicated version of the Italian corpus, which consists of 70 GB of plain text data (210M sentences with 11B words), where the sentences have been filtered and shuffled at line level in order to be used for NLP research.
-
-## Pre-trained model
-
-| Model | WWM | Cased | Tokenizer | Vocab Size  | Train Steps |  Download |
-| ------ | ------ | ------ | ------ | ------ |------ | ------ |
-| `umberto-commoncrawl-cased-v1` | YES | YES | SPM | 32K | 125k | [Link](http://bit.ly/35zO7GH) |
-
-This model was trained with [SentencePiece](https://github.com/google/sentencepiece) and Whole Word Masking.
-
-## Downstream Tasks
-These results refer to the umberto-commoncrawl-cased model. All details are at the [Umberto](https://github.com/musixmatchresearch/umberto) Official Page.
-
-#### Named Entity Recognition (NER)
-
-| Dataset | F1 | Precision | Recall | Accuracy |
-| ------ | ------ | ------ |  ------ |  ------ |
-| **ICAB-EvalITA07** | **87.565**  | 86.596  | 88.556  | 98.690 | 
-| **WikiNER-ITA** | **92.531**  | 92.509 | 92.553 | 99.136 | 
-
-#### Part of Speech (POS)
-
-| Dataset | F1 | Precision | Recall | Accuracy |
-| ------ | ------ | ------ |  ------ |  ------ |
-| **UD_Italian-ISDT** | 98.870  | 98.861 | 98.879 | **98.977** | 
-| **UD_Italian-ParTUT** | 98.786 | 98.812 |  98.760 | **98.903** | 
-
-
-
-## Usage
-
-##### Load UmBERTo with AutoModel, Autotokenizer:
-
-```python
-
-import torch
-from transformers import AutoTokenizer, AutoModel
-
-tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
-umberto = AutoModel.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
-
-encoded_input = tokenizer.encode("Umberto Eco è stato un grande scrittore")
-input_ids = torch.tensor(encoded_input).unsqueeze(0)  # Batch size 1
-outputs = umberto(input_ids)
-last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output
-```
-
-##### Predict masked token:
-
-```python
-from transformers import pipeline
-
-fill_mask = pipeline(
-	"fill-mask",
-	model="Musixmatch/umberto-commoncrawl-cased-v1",
-	tokenizer="Musixmatch/umberto-commoncrawl-cased-v1"
-)
-
-result = fill_mask("Umberto Eco è <mask> un grande scrittore")
-# {'sequence': '<s> Umberto Eco è considerato un grande scrittore</s>', 'score': 0.18599839508533478, 'token': 5032}
-# {'sequence': '<s> Umberto Eco è stato un grande scrittore</s>', 'score': 0.17816807329654694, 'token': 471}
-# {'sequence': '<s> Umberto Eco è sicuramente un grande scrittore</s>', 'score': 0.16565583646297455, 'token': 2654}
-# {'sequence': '<s> Umberto Eco è indubbiamente un grande scrittore</s>', 'score': 0.0932890921831131, 'token': 17908}
-# {'sequence': '<s> Umberto Eco è certamente un grande scrittore</s>', 'score': 0.054701317101716995, 'token': 5269}
-```
-
-
-## Citation
-All of the original datasets are publicly available or were released with the owners' grant. The datasets are all released under a CC0 or CCBY license.
-
-* UD Italian-ISDT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ISDT)
-* UD Italian-ParTUT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ParTUT)
-* I-CAB (Italian Content Annotation Bank), EvalITA [Page](http://www.evalita.it/)
-* WIKINER [Page](https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500) , [Paper](https://www.sciencedirect.com/science/article/pii/S0004370212000276?via%3Dihub)
-
-```
-@inproceedings {magnini2006annotazione,
-	title = {Annotazione di contenuti concettuali in un corpus italiano: I - CAB},
-	author = {Magnini,Bernardo and Cappelli,Amedeo and Pianta,Emanuele and Speranza,Manuela and Bartalesi Lenzi,V and Sprugnoli,Rachele and Romano,Lorenza and Girardi,Christian and Negri,Matteo},
-	booktitle = {Proc.of SILFI 2006},
-	year = {2006}
-}
-@inproceedings {magnini2006cab,
-	title = {I - CAB: the Italian Content Annotation Bank.},
-	author = {Magnini,Bernardo and Pianta,Emanuele and Girardi,Christian and Negri,Matteo and Romano,Lorenza and Speranza,Manuela and Lenzi,Valentina Bartalesi and Sprugnoli,Rachele},
-	booktitle = {LREC},
-	pages = {963--968},
-	year = {2006},
-	organization = {Citeseer}
-}
-```
-
-## Authors
-
-**Loreto Parisi**: `loreto at musixmatch dot com`, [loretoparisi](https://github.com/loretoparisi)<br>
-**Simone Francia**: `simone.francia at musixmatch dot com`, [simonefrancia](https://github.com/simonefrancia)<br>
-**Paolo Magnani**: `paul.magnani95 at gmail dot com`, [paulthemagno](https://github.com/paulthemagno)<br>
-
-## About Musixmatch AI
-![Musixmatch AI mac app icon-128](https://user-images.githubusercontent.com/163333/72244273-396aa380-35ee-11ea-894b-4ea48230c02b.png)<br>
-We do Machine Learning and Artificial Intelligence @[musixmatch](https://twitter.com/Musixmatch)<br>
-Follow us on [Twitter](https://twitter.com/musixmatchai) [Github](https://github.com/musixmatchresearch)
-
-
diff --git a/server/transformers/model_cards/Musixmatch/umberto-wikipedia-uncased-v1/README.md b/server/transformers/model_cards/Musixmatch/umberto-wikipedia-uncased-v1/README.md
deleted file mode 100644
index fd94e5e13daaa931940fe1a15c7d3e5b96d40f8a..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/Musixmatch/umberto-wikipedia-uncased-v1/README.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# UmBERTo Wikipedia Uncased
-
-[UmBERTo](https://github.com/musixmatchresearch/umberto) is a Roberta-based Language Model trained on large Italian Corpora and uses two innovative approaches: SentencePiece and Whole Word Masking. Now available at [github.com/huggingface/transformers](https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1)
-
-<p align="center">
-    <img src="https://user-images.githubusercontent.com/7140210/72913702-d55a8480-3d3d-11ea-99fc-f2ef29af4e72.jpg" width="700"> </br>
-    Marco Lodola, Monument to Umberto Eco, Alessandria 2019
-</p>
-
-## Dataset
-UmBERTo-Wikipedia-Uncased is trained on a relatively small corpus (~7GB) extracted from [Wikipedia-ITA](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/).
-
-## Pre-trained model
-
-| Model | WWM | Cased | Tokenizer | Vocab Size  | Train Steps |  Download |
-| ------ | ------ | ------ | ------ | ------ |------ | ------ |
-| `umberto-wikipedia-uncased-v1` | YES | YES | SPM | 32K | 100k | [Link](http://bit.ly/35wbSj6) |
-
-This model was trained with [SentencePiece](https://github.com/google/sentencepiece) and Whole Word Masking.
-
-## Downstream Tasks
-These results refer to the umberto-wikipedia-uncased model. All details are at the [Umberto](https://github.com/musixmatchresearch/umberto) Official Page.
-
-#### Named Entity Recognition (NER)
-
-| Dataset | F1 | Precision | Recall | Accuracy |
-| ------ | ------ | ------ |  ------ |  ----- |
-| **ICAB-EvalITA07** | **86.240** | 85.939 | 86.544 | 98.534 | 
-| **WikiNER-ITA** | **90.483** | 90.328 | 90.638 | 98.661 | 
-
-#### Part of Speech (POS)
-
-| Dataset | F1 | Precision | Recall | Accuracy |
-| ------ | ------ | ------ |  ------ |  ------ |
-| **UD_Italian-ISDT** | 98.563  | 98.508 | 98.618 | **98.717** | 
-| **UD_Italian-ParTUT** | 97.810 | 97.835 |  97.784 | **98.060** | 
-
-
-
-## Usage
-
-##### Load UmBERTo Wikipedia Uncased with AutoModel, Autotokenizer:
-
-```python
-
-import torch
-from transformers import AutoTokenizer, AutoModel
-
-tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1")
-umberto = AutoModel.from_pretrained("Musixmatch/umberto-wikipedia-uncased-v1")
-
-encoded_input = tokenizer.encode("Umberto Eco è stato un grande scrittore")
-input_ids = torch.tensor(encoded_input).unsqueeze(0)  # Batch size 1
-outputs = umberto(input_ids)
-last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output
-```
-
-##### Predict masked token:
-
-```python
-from transformers import pipeline
-
-fill_mask = pipeline(
-	"fill-mask",
-	model="Musixmatch/umberto-wikipedia-uncased-v1",
-	tokenizer="Musixmatch/umberto-wikipedia-uncased-v1"
-)
-
-result = fill_mask("Umberto Eco è <mask> un grande scrittore")
-# {'sequence': '<s> umberto eco è stato un grande scrittore</s>', 'score': 0.5784581303596497, 'token': 361}
-# {'sequence': '<s> umberto eco è anche un grande scrittore</s>', 'score': 0.33813193440437317, 'token': 269}
-# {'sequence': '<s> umberto eco è considerato un grande scrittore</s>', 'score': 0.027196012437343597, 'token': 3236}
-# {'sequence': '<s> umberto eco è diventato un grande scrittore</s>', 'score': 0.013716378249228, 'token': 5742}
-# {'sequence': '<s> umberto eco è inoltre un grande scrittore</s>', 'score': 0.010662357322871685, 'token': 1030}
-```
-
-
-## Citation
-All of the original datasets are publicly available or were released with the owners' grant. The datasets are all released under a CC0 or CCBY license.
-
-* UD Italian-ISDT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ISDT)
-* UD Italian-ParTUT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ParTUT)
-* I-CAB (Italian Content Annotation Bank), EvalITA [Page](http://www.evalita.it/)
-* WIKINER [Page](https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500) , [Paper](https://www.sciencedirect.com/science/article/pii/S0004370212000276?via%3Dihub)
-
-```
-@inproceedings {magnini2006annotazione,
-	title = {Annotazione di contenuti concettuali in un corpus italiano: I - CAB},
-	author = {Magnini,Bernardo and Cappelli,Amedeo and Pianta,Emanuele and Speranza,Manuela and Bartalesi Lenzi,V and Sprugnoli,Rachele and Romano,Lorenza and Girardi,Christian and Negri,Matteo},
-	booktitle = {Proc.of SILFI 2006},
-	year = {2006}
-}
-@inproceedings {magnini2006cab,
-	title = {I - CAB: the Italian Content Annotation Bank.},
-	author = {Magnini,Bernardo and Pianta,Emanuele and Girardi,Christian and Negri,Matteo and Romano,Lorenza and Speranza,Manuela and Lenzi,Valentina Bartalesi and Sprugnoli,Rachele},
-	booktitle = {LREC},
-	pages = {963--968},
-	year = {2006},
-	organization = {Citeseer}
-}
-```
-
-## Authors
-
-**Loreto Parisi**: `loreto at musixmatch dot com`, [loretoparisi](https://github.com/loretoparisi)<br>
-**Simone Francia**: `simone.francia at musixmatch dot com`, [simonefrancia](https://github.com/simonefrancia)<br>
-**Paolo Magnani**: `paul.magnani95 at gmail dot com`, [paulthemagno](https://github.com/paulthemagno)<br>
-
-## About Musixmatch AI
-![Musixmatch AI mac app icon-128](https://user-images.githubusercontent.com/163333/72244273-396aa380-35ee-11ea-894b-4ea48230c02b.png)<br>
-We do Machine Learning and Artificial Intelligence @[musixmatch](https://twitter.com/Musixmatch)<br>
-Follow us on [Twitter](https://twitter.com/musixmatchai) [Github](https://github.com/musixmatchresearch)
-
diff --git a/server/transformers/model_cards/dbmdz/bert-base-german-cased/README.md b/server/transformers/model_cards/dbmdz/bert-base-german-cased/README.md
deleted file mode 100644
index fccd05054577d6633cf6c6ed2193e875b0a0a560..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-german-cased/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# 🤗 + 📚 dbmdz German BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources additional German BERT models 🎉
-
-# German BERT
-
-## Stats
-
-In addition to the recently released [German BERT](https://deepset.ai/german-bert)
-model by [deepset](https://deepset.ai/) we provide another German-language model.
-
-The source data for the model consists of a recent Wikipedia dump, EU Bookshop corpus,
-Open Subtitles, CommonCrawl, ParaCrawl and News Crawl. This results in a dataset with
-a size of 16GB and 2,350,234,427 tokens.
-
-For sentence splitting, we use [spacy](https://spacy.io/). Our preprocessing steps
-(sentence piece model for vocab generation) follow those used for training
-[SciBERT](https://github.com/allenai/scibert). The model was trained with an initial
-sequence length of 512 subwords for 1.5M steps.
-
-This release includes both cased and uncased models.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                            | Downloads
-| -------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `bert-base-german-dbmdz-cased`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json) • [`pytorch_model.bin`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin) • [`vocab.txt`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt)
-| `bert-base-german-dbmdz-uncased` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json) • [`pytorch_model.bin`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin) • [`vocab.txt`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt)
-
-## Usage
-
-With Transformers >= 2.3 our German BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-german-cased")
-```
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/dbmdz/bert-base-german-uncased/README.md b/server/transformers/model_cards/dbmdz/bert-base-german-uncased/README.md
deleted file mode 100644
index fccd05054577d6633cf6c6ed2193e875b0a0a560..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-german-uncased/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# 🤗 + 📚 dbmdz German BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources additional German BERT models 🎉
-
-# German BERT
-
-## Stats
-
-In addition to the recently released [German BERT](https://deepset.ai/german-bert)
-model by [deepset](https://deepset.ai/) we provide another German-language model.
-
-The source data for the model consists of a recent Wikipedia dump, EU Bookshop corpus,
-Open Subtitles, CommonCrawl, ParaCrawl and News Crawl. This results in a dataset with
-a size of 16GB and 2,350,234,427 tokens.
-
-For sentence splitting, we use [spacy](https://spacy.io/). Our preprocessing steps
-(sentence piece model for vocab generation) follow those used for training
-[SciBERT](https://github.com/allenai/scibert). The model was trained with an initial
-sequence length of 512 subwords for 1.5M steps.
-
-This release includes both cased and uncased models.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                            | Downloads
-| -------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `bert-base-german-dbmdz-cased`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json) • [`pytorch_model.bin`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin) • [`vocab.txt`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt)
-| `bert-base-german-dbmdz-uncased` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json) • [`pytorch_model.bin`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin) • [`vocab.txt`](https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt)
-
-## Usage
-
-With Transformers >= 2.3 our German BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-german-cased")
-```
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/dbmdz/bert-base-italian-cased/README.md b/server/transformers/model_cards/dbmdz/bert-base-italian-cased/README.md
deleted file mode 100644
index 549c1133af281477b2b62101b39862cf010e8d2f..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-italian-cased/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 🤗 + 📚 dbmdz BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources Italian BERT models 🎉
-
-# Italian BERT
-
-The source data for the Italian BERT model consists of a recent Wikipedia dump and
-various texts from the [OPUS corpora](http://opus.nlpl.eu/) collection. The final
-training corpus has a size of 13GB and 2,050,057,573 tokens.
-
-For sentence splitting, we use NLTK (faster compared to spacy).
-Our cased and uncased models are trained with an initial sequence length of 512
-subwords for ~2-3M steps.
-
-For the XXL Italian models, we use the same training data from OPUS and extend
-it with data from the Italian part of the [OSCAR corpus](https://traces1.inria.fr/oscar/).
-Thus, the final training corpus has a size of 81GB and 13,138,379,147 tokens.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                                   | Downloads
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `dbmdz/bert-base-italian-cased`         | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/config.json)       • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/pytorch_model.bin)       • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/vocab.txt)
-| `dbmdz/bert-base-italian-uncased`       | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/config.json)     • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/pytorch_model.bin)     • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-cased`     | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/config.json)   • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/pytorch_model.bin)   • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-uncased`   | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/vocab.txt)
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-## Usage
-
-With Transformers >= 2.3 our Italian BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-cased")
-```
-
-To load the (recommended) Italian XXL BERT models, just use:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-```
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/dbmdz/bert-base-italian-uncased/README.md b/server/transformers/model_cards/dbmdz/bert-base-italian-uncased/README.md
deleted file mode 100644
index 549c1133af281477b2b62101b39862cf010e8d2f..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-italian-uncased/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 🤗 + 📚 dbmdz BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources Italian BERT models 🎉
-
-# Italian BERT
-
-The source data for the Italian BERT model consists of a recent Wikipedia dump and
-various texts from the [OPUS corpora](http://opus.nlpl.eu/) collection. The final
-training corpus has a size of 13GB and 2,050,057,573 tokens.
-
-For sentence splitting, we use NLTK (faster compared to spacy).
-Our cased and uncased models are trained with an initial sequence length of 512
-subwords for ~2-3M steps.
-
-For the XXL Italian models, we use the same training data from OPUS and extend
-it with data from the Italian part of the [OSCAR corpus](https://traces1.inria.fr/oscar/).
-Thus, the final training corpus has a size of 81GB and 13,138,379,147 tokens.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                                   | Downloads
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `dbmdz/bert-base-italian-cased`         | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/config.json)       • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/pytorch_model.bin)       • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/vocab.txt)
-| `dbmdz/bert-base-italian-uncased`       | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/config.json)     • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/pytorch_model.bin)     • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-cased`     | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/config.json)   • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/pytorch_model.bin)   • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-uncased`   | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/vocab.txt)
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-## Usage
-
-With Transformers >= 2.3 our Italian BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-cased")
-```
-
-To load the (recommended) Italian XXL BERT models, just use:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-```
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-cased/README.md b/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-cased/README.md
deleted file mode 100644
index 549c1133af281477b2b62101b39862cf010e8d2f..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-cased/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 🤗 + 📚 dbmdz BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources Italian BERT models 🎉
-
-# Italian BERT
-
-The source data for the Italian BERT model consists of a recent Wikipedia dump and
-various texts from the [OPUS corpora](http://opus.nlpl.eu/) collection. The final
-training corpus has a size of 13GB and 2,050,057,573 tokens.
-
-For sentence splitting, we use NLTK (faster compared to spacy).
-Our cased and uncased models are trained with an initial sequence length of 512
-subwords for ~2-3M steps.
-
-For the XXL Italian models, we use the same training data from OPUS and extend
-it with data from the Italian part of the [OSCAR corpus](https://traces1.inria.fr/oscar/).
-Thus, the final training corpus has a size of 81GB and 13,138,379,147 tokens.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                                   | Downloads
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `dbmdz/bert-base-italian-cased`         | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/config.json)       • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/pytorch_model.bin)       • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/vocab.txt)
-| `dbmdz/bert-base-italian-uncased`       | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/config.json)     • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/pytorch_model.bin)     • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-cased`     | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/config.json)   • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/pytorch_model.bin)   • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-uncased`   | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/vocab.txt)
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-## Usage
-
-With Transformers >= 2.3 our Italian BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-cased")
-```
-
-To load the (recommended) Italian XXL BERT models, just use:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-```
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-uncased/README.md b/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-uncased/README.md
deleted file mode 100644
index 549c1133af281477b2b62101b39862cf010e8d2f..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/dbmdz/bert-base-italian-xxl-uncased/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 🤗 + 📚 dbmdz BERT models
-
-In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State
-Library open sources Italian BERT models 🎉
-
-# Italian BERT
-
-The source data for the Italian BERT model consists of a recent Wikipedia dump and
-various texts from the [OPUS corpora](http://opus.nlpl.eu/) collection. The final
-training corpus has a size of 13GB and 2,050,057,573 tokens.
-
-For sentence splitting, we use NLTK (faster compared to spacy).
-Our cased and uncased models are trained with an initial sequence length of 512
-subwords for ~2-3M steps.
-
-For the XXL Italian models, we use the same training data from OPUS and extend
-it with data from the Italian part of the [OSCAR corpus](https://traces1.inria.fr/oscar/).
-Thus, the final training corpus has a size of 81GB and 13,138,379,147 tokens.
-
-## Model weights
-
-Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers)
-compatible weights are available. If you need access to TensorFlow checkpoints,
-please raise an issue!
-
-| Model                                   | Downloads
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `dbmdz/bert-base-italian-cased`         | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/config.json)       • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/pytorch_model.bin)       • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-cased/vocab.txt)
-| `dbmdz/bert-base-italian-uncased`       | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/config.json)     • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/pytorch_model.bin)     • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-uncased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-cased`     | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/config.json)   • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/pytorch_model.bin)   • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-cased/vocab.txt)
-| `dbmdz/bert-base-italian-xxl-uncased`   | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-italian-xxl-uncased/vocab.txt)
-
-## Results
-
-For results on downstream tasks like NER or PoS tagging, please refer to
-[this repository](https://github.com/stefan-it/fine-tuned-berts-seq).
-
-## Usage
-
-With Transformers >= 2.3 our Italian BERT models can be loaded like:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-cased")
-```
-
-To load the (recommended) Italian XXL BERT models, just use:
-
-```python
-from transformers import AutoModel, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-model = AutoModel.from_pretrained("dbmdz/bert-base-italian-xxl-cased")
-```
-
-# Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz).
-
-# Contact (Bugs, Feedback, Contribution and more)
-
-For questions about our BERT models just open an issue
-[here](https://github.com/dbmdz/berts/issues/new) 🤗
-
-# Acknowledgments
-
-Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC).
-Thanks for providing access to the TFRC ❤️
-
-Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team,
-it is possible to download both cased and uncased models from their S3 storage 🤗
diff --git a/server/transformers/model_cards/henryk/bert-base-multilingual-cased-finetuned-dutch-squad2/README.md b/server/transformers/model_cards/henryk/bert-base-multilingual-cased-finetuned-dutch-squad2/README.md
deleted file mode 100644
index 3d366061a4cb3ff9ade87e83382813a7b5de0855..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/henryk/bert-base-multilingual-cased-finetuned-dutch-squad2/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Multilingual + Dutch SQuAD2.0
-
-This model is the multilingual model provided by the Google research team, fine-tuned on a Dutch Q&A downstream task.
-
-## Details of the language model(bert-base-multilingual-cased)
-
-Language model ([**bert-base-multilingual-cased**](https://github.com/google-research/bert/blob/master/multilingual.md)):
-12-layer, 768-hidden, 12-heads, 110M parameters.
-Trained on cased text in the top 104 languages with the largest Wikipedias.
-
-## Details of the downstream task - Dataset
-Using the `mtranslate` Python module, [**SQuAD2.0**](https://rajpurkar.github.io/SQuAD-explorer/) was machine-translated. In order to find the start tokens the direct translations of the answers were searched in the corresponding paragraphs. Since the answer could not always be found in the text, due to the different translations depending on the context (missing context in the pure answer), a loss of question-answer examples occurred. This is a potential problem where errors can occur in the data set (but in the end it was a quick and dirty solution that worked well enough for my task).
-
-| Dataset                | # Q&A |
-| ---------------------- | ----- |
-| SQuAD2.0 Train         | 130 K |
-| Dutch SQuAD2.0 Train   | 99  K |
-| SQuAD2.0 Dev           | 12  K |
-| Dutch SQuAD2.0 Dev     | 10  K |
-
-## Model training
-
-The model was trained on a Tesla V100 GPU with the following command:
-
-```bash
-export SQUAD_DIR=path/to/nl_squad
-
-python run_squad.py \
-  --model_type bert \
-  --model_name_or_path bert-base-multilingual-cased \
-  --version_2_with_negative \
-  --do_train \
-  --do_eval \
-  --train_file $SQUAD_DIR/train_nl-v2.0.json \
-  --predict_file $SQUAD_DIR/dev_nl-v2.0.json \
-  --per_gpu_train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/output_dir/
-```
-
-**Results**:
-
-{'exact': **67.38**, 'f1': **71.36**} 
\ No newline at end of file
diff --git a/server/transformers/model_cards/jplu/tf-camembert-base/README.md b/server/transformers/model_cards/jplu/tf-camembert-base/README.md
deleted file mode 100644
index be8e1380e83936540e872f6f061f28380422f423..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/jplu/tf-camembert-base/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Tensorflow CamemBERT
-
-In this repository you will find different versions of the CamemBERT model for Tensorflow.
-
-## CamemBERT
-
-[CamemBERT](https://camembert-model.fr/) is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR.
-
-## Model Weights
-
-| Model                            | Downloads
-| -------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `jplu/tf-camembert-base`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/tf_model.h5)
-
-## Usage
-
-With Transformers >= 2.4 the Tensorflow models of CamemBERT can be loaded like:
-
-```python
-from transformers import TFCamembertModel
-
-model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")
-```
-
-## Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/jplu).
-
-## Acknowledgments
-
-Thanks to all the Huggingface team for the support and their amazing library!
diff --git a/server/transformers/model_cards/jplu/tf-xlm-roberta-base/README.md b/server/transformers/model_cards/jplu/tf-xlm-roberta-base/README.md
deleted file mode 100644
index 39569c71c9f83c5258ccc2c6a52de803decfbc38..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/jplu/tf-xlm-roberta-base/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Tensorflow XLM-RoBERTa
-
-In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow.
-
-## XLM-RoBERTa
-
-[XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross-lingual sentence encoder. It is trained on 2.5TB of data across 100 languages, filtered from Common Crawl. XLM-R achieves state-of-the-art results on multiple cross-lingual benchmarks.
-
-## Model Weights
-
-| Model                            | Downloads
-| -------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `jplu/tf-xlm-roberta-base`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5)
-| `jplu/tf-xlm-roberta-large`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5)
-
-## Usage
-
-With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like:
-
-```python
-from transformers import TFXLMRobertaModel
-
-model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
-```
-Or
-```python
-model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large")
-```
-
-## Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/jplu).
-
-## Acknowledgments
-
-Thanks to all the Huggingface team for the support and their amazing library!
diff --git a/server/transformers/model_cards/jplu/tf-xlm-roberta-large/README.md b/server/transformers/model_cards/jplu/tf-xlm-roberta-large/README.md
deleted file mode 100644
index 39569c71c9f83c5258ccc2c6a52de803decfbc38..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/jplu/tf-xlm-roberta-large/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Tensorflow XLM-RoBERTa
-
-In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow.
-
-## XLM-RoBERTa
-
-[XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross-lingual sentence encoder. It is trained on 2.5TB of data across 100 languages, filtered from Common Crawl. XLM-R achieves state-of-the-art results on multiple cross-lingual benchmarks.
-
-## Model Weights
-
-| Model                            | Downloads
-| -------------------------------- | ---------------------------------------------------------------------------------------------------------------
-| `jplu/tf-xlm-roberta-base`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5)
-| `jplu/tf-xlm-roberta-large`   | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5)
-
-## Usage
-
-With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like:
-
-```python
-from transformers import TFXLMRobertaModel
-
-model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
-```
-Or
-```python
-model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large")
-```
-
-## Huggingface model hub
-
-All models are available on the [Huggingface model hub](https://huggingface.co/jplu).
-
-## Acknowledgments
-
-Thanks to all the Huggingface team for the support and their amazing library!
diff --git a/server/transformers/model_cards/julien-c/bert-xsmall-dummy/README.md b/server/transformers/model_cards/julien-c/bert-xsmall-dummy/README.md
deleted file mode 100644
index 36eef6232722f15d84f08d414020550d1af36f9a..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/julien-c/bert-xsmall-dummy/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-## How to build a dummy model
-
-
-```python
-from transformers.configuration_bert import BertConfig
-from transformers.modeling_bert import BertForMaskedLM
-from transformers.modeling_tf_bert import TFBertForMaskedLM
-from transformers.tokenization_bert import BertTokenizer
-
-
-SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
-DIRNAME = "./bert-xsmall-dummy"
-
-config = BertConfig(10, 20, 1, 1, 40)
-
-model = BertForMaskedLM(config)
-model.save_pretrained(DIRNAME)
-
-tf_model = TFBertForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
-tf_model.save_pretrained(DIRNAME)
-
-# Slightly different for tokenizer.
-# tokenizer = BertTokenizer.from_pretrained(DIRNAME)
-# tokenizer.save_pretrained()
-```
diff --git a/server/transformers/model_cards/julien-c/dummy-unknown/README.md b/server/transformers/model_cards/julien-c/dummy-unknown/README.md
deleted file mode 100644
index 9cdc3d24375813a747b340b31ece2a24a9124f39..0000000000000000000000000000000000000000
--- a/server/transformers/model_cards/julien-c/dummy-unknown/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-
-```python
-import json
-import os
-from transformers.configuration_roberta import RobertaConfig
-from transformers import RobertaForMaskedLM, TFRobertaForMaskedLM
-
-DIRNAME = "./dummy-unknown"
-
-
-config = RobertaConfig(10, 20, 1, 1, 40)
-
-model = RobertaForMaskedLM(config)
-model.save_pretrained(DIRNAME)
-
-tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
-tf_model.save_pretrained(DIRNAME)
-
-# Tokenizer:
-
-vocab = [
-    "l",
-    "o",
-    "w",
-    "e",
-    "r",
-    "s",
-    "t",
-    "i",
-    "d",
-    "n",
-    "\u0120",
-    "\u0120l",
-    "\u0120n",
-    "\u0120lo",
-    "\u0120low",
-    "er",
-    "\u0120lowest",
-    "\u0120newer",
-    "\u0120wider",
-    "<unk>",
-]
-vocab_tokens = dict(zip(vocab, range(len(vocab))))
-merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-
-vocab_file = os.path.join(DIRNAME, "vocab.json")
-merges_file = os.path.join(DIRNAME, "merges.txt")
-with open(vocab_file, "w", encoding="utf-8") as fp:
-    fp.write(json.dumps(vocab_tokens) + "\n")
-with open(merges_file, "w", encoding="utf-8") as fp:
-    fp.write("\n".join(merges))
-```
diff --git a/server/transformers/notebooks/Comparing-PT-and-TF-models.ipynb b/server/transformers/notebooks/Comparing-PT-and-TF-models.ipynb
deleted file mode 100644
index 321c2ebe30e21531e894a8057e6c520736eb3b19..0000000000000000000000000000000000000000
--- a/server/transformers/notebooks/Comparing-PT-and-TF-models.ipynb
+++ /dev/null
@@ -1,1630 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Pytorch to Tensorflow Conversion Test Notebook\n",
-    "\n",
-    "To run this notebook follow these steps, modifying the **Config** section as necessary:\n",
-    "\n",
-    "1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\n",
-    "2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\n",
-    "\n",
-    "Note: \n",
-    "1. This feature currently only supports the base BERT models (uncased/cased).\n",
-    "2. Tensorflow model will be dumped in `tf_model_dir`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import sys\n",
-    "\n",
-    "model_cls  = 'BertModel'\n",
-    "model_typ  = 'bert-base-uncased'\n",
-    "token_cls  = 'BertTokenizer'\n",
-    "max_seq    = 12\n",
-    "CLS        = \"[CLS]\"\n",
-    "SEP        = \"[SEP]\"\n",
-    "MASK       = \"[MASK]\"\n",
-    "CLS_IDX    = 0\n",
-    "layer_idxs = tuple(range(12))\n",
-    "input_text = \"jim henson was a puppeteer\"\n",
-    "\n",
-    "pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n",
-    "tf_bert_dir  = \"/home/ubuntu/bert\"\n",
-    "\n",
-    "pt_vocab_file  = os.path.join(pt_model_dir, \"vocab.txt\")\n",
-    "pt_init_ckpt   = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n",
-    "tf_model_dir   = os.path.join(pt_model_dir, 'tf')\n",
-    "tf_vocab_file  = os.path.join(tf_model_dir, \"vocab.txt\")\n",
-    "tf_init_ckpt   = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n",
-    "tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n",
-    "\n",
-    "if not os.path.isdir(tf_model_dir): \n",
-    "    os.makedirs(tf_model_dir, exist_ok=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Tokenization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def tokenize(text, tokenizer):\n",
-    "    text = text.strip().lower()\n",
-    "    tok_ids = tokenizer.tokenize(text)\n",
-    "    if len(tok_ids) > max_seq - 2:\n",
-    "        tok_ids = tok_ids[:max_seq - 2]\n",
-    "    tok_ids.insert(CLS_IDX, CLS)\n",
-    "    tok_ids.append(SEP)\n",
-    "    input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n",
-    "    mask_ids = [1] * len(input_ids)\n",
-    "    seg_ids = [0] * len(input_ids)\n",
-    "    padding = [0] * (max_seq - len(input_ids))\n",
-    "    input_ids += padding\n",
-    "    mask_ids += padding\n",
-    "    seg_ids += padding\n",
-    "    return input_ids, mask_ids, seg_ids"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Pytorch execution"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n",
-      "100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Pytorch embedding shape: (1, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "import torch\n",
-    "from pytorch_pretrained_bert import (BertConfig,\n",
-    "                                     BertModel, \n",
-    "                                     BertTokenizer, \n",
-    "                                     BertForSequenceClassification)\n",
-    "\n",
-    "# Save Vocab\n",
-    "pt_tokenizer = BertTokenizer.from_pretrained(\n",
-    "    pretrained_model_name_or_path=model_typ, \n",
-    "    cache_dir=pt_model_dir)\n",
-    "pt_tokenizer.save_vocabulary(pt_model_dir)\n",
-    "pt_tokenizer.save_vocabulary(tf_model_dir)\n",
-    "\n",
-    "# Save Model\n",
-    "pt_model = BertModel.from_pretrained(\n",
-    "    pretrained_model_name_or_path=model_typ, \n",
-    "    cache_dir=pt_model_dir).to('cpu')\n",
-    "pt_model.eval()\n",
-    "pt_model.config.hidden_dropout_prob = 0.0\n",
-    "pt_model.config.attention_probs_dropout_prob = 0.0\n",
-    "pt_model.config.to_json_file(tf_config_file)\n",
-    "torch.save(pt_model.state_dict(), pt_init_ckpt)\n",
-    "\n",
-    "# Inputs\n",
-    "input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n",
-    "\n",
-    "# PT Embedding\n",
-    "tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n",
-    "seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n",
-    "msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n",
-    "attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n",
-    "pt_embedding = nsp_logits.detach().numpy() \n",
-    "print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Pytorch &rarr; Tensorflow conversion"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Colocations handled automatically by placer.\n",
-      "bert/embeddings/word_embeddings                             initialized\n",
-      "bert/embeddings/position_embeddings                         initialized\n",
-      "bert/embeddings/token_type_embeddings                       initialized\n",
-      "bert/embeddings/LayerNorm/gamma                             initialized\n",
-      "bert/embeddings/LayerNorm/beta                              initialized\n",
-      "bert/encoder/layer_0/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_0/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_0/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_0/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_0/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_0/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_0/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_0/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_0/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_0/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_0/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_0/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_0/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_0/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_1/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_1/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_1/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_1/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_1/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_1/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_1/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_1/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_1/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_1/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_1/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_1/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_1/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_1/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_2/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_2/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_2/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_2/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_2/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_2/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_2/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_2/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_2/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_2/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_2/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_2/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_2/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_2/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_3/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_3/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_3/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_3/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_3/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_3/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_3/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_3/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_3/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_3/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_3/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_3/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_3/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_3/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_4/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_4/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_4/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_4/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_4/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_4/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_4/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_4/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_4/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_4/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_4/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_4/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_4/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_4/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_5/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_5/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_5/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_5/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_5/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_5/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_5/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_5/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_5/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_5/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_5/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_5/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_5/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_5/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_6/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_6/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_6/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_6/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_6/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_6/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_6/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_6/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_6/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_6/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_6/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_6/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_6/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_6/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_7/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_7/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_7/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_7/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_7/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_7/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_7/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_7/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_7/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_7/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_7/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_7/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_7/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_7/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_8/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_8/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_8/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_8/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_8/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_8/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_8/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_8/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_8/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_8/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_8/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_8/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_8/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_8/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_9/attention/self/query/kernel            initialized\n",
-      "bert/encoder/layer_9/attention/self/query/bias              initialized\n",
-      "bert/encoder/layer_9/attention/self/key/kernel              initialized\n",
-      "bert/encoder/layer_9/attention/self/key/bias                initialized\n",
-      "bert/encoder/layer_9/attention/self/value/kernel            initialized\n",
-      "bert/encoder/layer_9/attention/self/value/bias              initialized\n",
-      "bert/encoder/layer_9/attention/output/dense/kernel          initialized\n",
-      "bert/encoder/layer_9/attention/output/dense/bias            initialized\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/gamma       initialized\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/beta        initialized\n",
-      "bert/encoder/layer_9/intermediate/dense/kernel              initialized\n",
-      "bert/encoder/layer_9/intermediate/dense/bias                initialized\n",
-      "bert/encoder/layer_9/output/dense/kernel                    initialized\n",
-      "bert/encoder/layer_9/output/dense/bias                      initialized\n",
-      "bert/encoder/layer_9/output/LayerNorm/gamma                 initialized\n",
-      "bert/encoder/layer_9/output/LayerNorm/beta                  initialized\n",
-      "bert/encoder/layer_10/attention/self/query/kernel           initialized\n",
-      "bert/encoder/layer_10/attention/self/query/bias             initialized\n",
-      "bert/encoder/layer_10/attention/self/key/kernel             initialized\n",
-      "bert/encoder/layer_10/attention/self/key/bias               initialized\n",
-      "bert/encoder/layer_10/attention/self/value/kernel           initialized\n",
-      "bert/encoder/layer_10/attention/self/value/bias             initialized\n",
-      "bert/encoder/layer_10/attention/output/dense/kernel         initialized\n",
-      "bert/encoder/layer_10/attention/output/dense/bias           initialized\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/gamma      initialized\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/beta       initialized\n",
-      "bert/encoder/layer_10/intermediate/dense/kernel             initialized\n",
-      "bert/encoder/layer_10/intermediate/dense/bias               initialized\n",
-      "bert/encoder/layer_10/output/dense/kernel                   initialized\n",
-      "bert/encoder/layer_10/output/dense/bias                     initialized\n",
-      "bert/encoder/layer_10/output/LayerNorm/gamma                initialized\n",
-      "bert/encoder/layer_10/output/LayerNorm/beta                 initialized\n",
-      "bert/encoder/layer_11/attention/self/query/kernel           initialized\n",
-      "bert/encoder/layer_11/attention/self/query/bias             initialized\n",
-      "bert/encoder/layer_11/attention/self/key/kernel             initialized\n",
-      "bert/encoder/layer_11/attention/self/key/bias               initialized\n",
-      "bert/encoder/layer_11/attention/self/value/kernel           initialized\n",
-      "bert/encoder/layer_11/attention/self/value/bias             initialized\n",
-      "bert/encoder/layer_11/attention/output/dense/kernel         initialized\n",
-      "bert/encoder/layer_11/attention/output/dense/bias           initialized\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/gamma      initialized\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/beta       initialized\n",
-      "bert/encoder/layer_11/intermediate/dense/kernel             initialized\n",
-      "bert/encoder/layer_11/intermediate/dense/bias               initialized\n",
-      "bert/encoder/layer_11/output/dense/kernel                   initialized\n",
-      "bert/encoder/layer_11/output/dense/bias                     initialized\n",
-      "bert/encoder/layer_11/output/LayerNorm/gamma                initialized\n",
-      "bert/encoder/layer_11/output/LayerNorm/beta                 initialized\n",
-      "bert/pooler/dense/kernel                                    initialized\n",
-      "bert/pooler/dense/bias                                      initialized\n"
-     ]
-    }
-   ],
-   "source": [
-    "from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n",
-    "\n",
-    "main([\n",
-    "    '--model_name', model_typ, \n",
-    "    '--pytorch_model_path', pt_init_ckpt,\n",
-    "    '--tf_cache_dir', tf_model_dir,\n",
-    "    '--cache_dir', pt_model_dir\n",
-    "])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Tensorflow execution"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
-      "For more information, please see:\n",
-      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
-      "  * https://github.com/tensorflow/addons\n",
-      "If you depend on functionality not listed there, please file an issue.\n",
-      "\n",
-      "WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Use keras.layers.dense instead.\n",
-      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Use standard file APIs to check for files with this prefix.\n",
-      "INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n",
-      "Tensorflow embedding shape: (1, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import tensorflow as tf\n",
-    "sys.path.insert(0, tf_bert_dir)\n",
-    "import modeling\n",
-    "import tokenization\n",
-    "\n",
-    "tf.reset_default_graph()\n",
-    "\n",
-    "# Process text\n",
-    "tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n",
-    "\n",
-    "# Graph inputs\n",
-    "input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n",
-    "config = modeling.BertConfig.from_json_file(\n",
-    "    os.path.join(tf_model_dir, 'bert_config.json'))\n",
-    "input_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='input_ids')\n",
-    "mask_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='mask_ids')\n",
-    "seg_tensor = tf.placeholder(\n",
-    "    dtype=tf.int32,\n",
-    "    shape=[1, None],\n",
-    "    name='seg_ids')\n",
-    "tf_model = modeling.BertModel(\n",
-    "    config=config,\n",
-    "    is_training=False,\n",
-    "    input_ids=input_tensor,\n",
-    "    input_mask=mask_tensor,\n",
-    "    token_type_ids=seg_tensor,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "output_layer = tf_model.get_pooled_output()\n",
-    "\n",
-    "# Load tf model\n",
-    "session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
-    "vars_to_load = [v for v in tf.global_variables()]\n",
-    "session.run(tf.variables_initializer(var_list=vars_to_load))\n",
-    "saver = tf.train.Saver(vars_to_load)\n",
-    "saver.restore(session, save_path=tf_init_ckpt)\n",
-    "\n",
-    "# TF Embedding\n",
-    "fetches = output_layer\n",
-    "feed_dict  = {\n",
-    "    input_tensor: [input_ids_tf],\n",
-    "    mask_tensor: [mask_ids_tf],\n",
-    "    seg_tensor: [seg_ids_tf]\n",
-    "}\n",
-    "tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n",
-    "print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Tokenization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
-      "TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
-      "SEG_IDS_PT:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "SEG_IDS_TF:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "MASK_IDS_PT:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
-      "MASK_IDS_TF:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n",
-    "print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n",
-    "print(\"SEG_IDS_PT:   {}\".format(seg_ids_pt))\n",
-    "print(\"SEG_IDS_TF:   {}\".format(seg_ids_tf))\n",
-    "print(\"MASK_IDS_PT:  {}\".format(mask_ids_pt))\n",
-    "print(\"MASK_IDS_TF:  {}\".format(mask_ids_tf))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Model Weights"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bert/embeddings/word_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
-      "TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
-      "\n",
-      "bert/embeddings/token_type_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
-      "TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
-      "\n",
-      "bert/embeddings/position_embeddings\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
-      "TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
-      "\n",
-      "bert/embeddings/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
-      "TF: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
-      "\n",
-      "bert/embeddings/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
-      "TF: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
-      "TF: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
-      "TF: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
-      "TF: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
-      "TF: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
-      "TF: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
-      "TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
-      "TF: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
-      "TF: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
-      "TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
-      "\n",
-      "bert/encoder/layer_0/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
-      "TF: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
-      "\n",
-      "bert/encoder/layer_0/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
-      "TF: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
-      "\n",
-      "bert/encoder/layer_0/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
-      "TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
-      "\n",
-      "bert/encoder/layer_0/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
-      "TF: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
-      "\n",
-      "bert/encoder/layer_0/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
-      "TF: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
-      "\n",
-      "bert/encoder/layer_0/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
-      "TF: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
-      "\n",
-      "bert/encoder/layer_0/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
-      "TF: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
-      "TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
-      "TF: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
-      "TF: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
-      "TF: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
-      "TF: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
-      "TF: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
-      "TF: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
-      "TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
-      "TF: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
-      "\n",
-      "bert/encoder/layer_1/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
-      "TF: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
-      "\n",
-      "bert/encoder/layer_1/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
-      "TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
-      "\n",
-      "bert/encoder/layer_1/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
-      "TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
-      "\n",
-      "bert/encoder/layer_1/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
-      "TF: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
-      "\n",
-      "bert/encoder/layer_1/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
-      "TF: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
-      "\n",
-      "bert/encoder/layer_1/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
-      "TF: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
-      "\n",
-      "bert/encoder/layer_1/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
-      "TF: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
-      "TF: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
-      "TF: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
-      "TF: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
-      "TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
-      "TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
-      "TF: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
-      "TF: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
-      "TF: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
-      "TF: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
-      "\n",
-      "bert/encoder/layer_2/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
-      "TF: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
-      "\n",
-      "bert/encoder/layer_2/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
-      "TF: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
-      "\n",
-      "bert/encoder/layer_2/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
-      "TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
-      "\n",
-      "bert/encoder/layer_2/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
-      "TF: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
-      "\n",
-      "bert/encoder/layer_2/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
-      "TF: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
-      "\n",
-      "bert/encoder/layer_2/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
-      "TF: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
-      "\n",
-      "bert/encoder/layer_2/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
-      "TF: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
-      "TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
-      "TF: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
-      "TF: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
-      "TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
-      "TF: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
-      "TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
-      "TF: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
-      "TF: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
-      "TF: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
-      "\n",
-      "bert/encoder/layer_3/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
-      "TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
-      "\n",
-      "bert/encoder/layer_3/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
-      "TF: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
-      "\n",
-      "bert/encoder/layer_3/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
-      "TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
-      "\n",
-      "bert/encoder/layer_3/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
-      "TF: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
-      "\n",
-      "bert/encoder/layer_3/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
-      "TF: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
-      "\n",
-      "bert/encoder/layer_3/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
-      "TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
-      "\n",
-      "bert/encoder/layer_3/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
-      "TF: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
-      "TF: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
-      "TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
-      "TF: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
-      "TF: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
-      "TF: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
-      "TF: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
-      "TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
-      "TF: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
-      "TF: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
-      "\n",
-      "bert/encoder/layer_4/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
-      "TF: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
-      "\n",
-      "bert/encoder/layer_4/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
-      "TF: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
-      "\n",
-      "bert/encoder/layer_4/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
-      "TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
-      "\n",
-      "bert/encoder/layer_4/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
-      "TF: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
-      "\n",
-      "bert/encoder/layer_4/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
-      "TF: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
-      "\n",
-      "bert/encoder/layer_4/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
-      "TF: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
-      "\n",
-      "bert/encoder/layer_4/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
-      "TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
-      "TF: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
-      "TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
-      "TF: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
-      "TF: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
-      "TF: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
-      "TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
-      "TF: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
-      "TF: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
-      "TF: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
-      "\n",
-      "bert/encoder/layer_5/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
-      "TF: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
-      "\n",
-      "bert/encoder/layer_5/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
-      "TF: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
-      "\n",
-      "bert/encoder/layer_5/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
-      "TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
-      "\n",
-      "bert/encoder/layer_5/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
-      "TF: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
-      "\n",
-      "bert/encoder/layer_5/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
-      "TF: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
-      "\n",
-      "bert/encoder/layer_5/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
-      "TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
-      "\n",
-      "bert/encoder/layer_5/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
-      "TF: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
-      "TF: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
-      "TF: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
-      "TF: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
-      "TF: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
-      "TF: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
-      "TF: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
-      "TF: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
-      "TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
-      "TF: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
-      "\n",
-      "bert/encoder/layer_6/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
-      "TF: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
-      "\n",
-      "bert/encoder/layer_6/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
-      "TF: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
-      "\n",
-      "bert/encoder/layer_6/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
-      "TF: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
-      "\n",
-      "bert/encoder/layer_6/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
-      "TF: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
-      "\n",
-      "bert/encoder/layer_6/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
-      "TF: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
-      "\n",
-      "bert/encoder/layer_6/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
-      "TF: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
-      "\n",
-      "bert/encoder/layer_6/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
-      "TF: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
-      "TF: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
-      "TF: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
-      "TF: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
-      "TF: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
-      "TF: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
-      "TF: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
-      "TF: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
-      "TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
-      "TF: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
-      "\n",
-      "bert/encoder/layer_7/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
-      "TF: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
-      "\n",
-      "bert/encoder/layer_7/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
-      "TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
-      "\n",
-      "bert/encoder/layer_7/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
-      "TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
-      "\n",
-      "bert/encoder/layer_7/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
-      "TF: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
-      "\n",
-      "bert/encoder/layer_7/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
-      "TF: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
-      "\n",
-      "bert/encoder/layer_7/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
-      "TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
-      "\n",
-      "bert/encoder/layer_7/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
-      "TF: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
-      "TF: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
-      "TF: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
-      "TF: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
-      " -4.4074579e-04]\n",
-      "TF: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
-      " -4.4074579e-04]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
-      "TF: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
-      "TF: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
-      "TF: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
-      "TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
-      "TF: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
-      "\n",
-      "bert/encoder/layer_8/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
-      "TF: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
-      "\n",
-      "bert/encoder/layer_8/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
-      "TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
-      "\n",
-      "bert/encoder/layer_8/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
-      "TF: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
-      "\n",
-      "bert/encoder/layer_8/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
-      "TF: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
-      "\n",
-      "bert/encoder/layer_8/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
-      "TF: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
-      "\n",
-      "bert/encoder/layer_8/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
-      "TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
-      "\n",
-      "bert/encoder/layer_8/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
-      "TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
-      "TF: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
-      "TF: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
-      "TF: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
-      "TF: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
-      "TF: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
-      "TF: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
-      "TF: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
-      "TF: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
-      "TF: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
-      "\n",
-      "bert/encoder/layer_9/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
-      "TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
-      "\n",
-      "bert/encoder/layer_9/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
-      "TF: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
-      "\n",
-      "bert/encoder/layer_9/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
-      "TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
-      "\n",
-      "bert/encoder/layer_9/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
-      "TF: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
-      "\n",
-      "bert/encoder/layer_9/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
-      "TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
-      "\n",
-      "bert/encoder/layer_9/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
-      "TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
-      "\n",
-      "bert/encoder/layer_9/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
-      "TF: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
-      "TF: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
-      "TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
-      "TF: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
-      "TF: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
-      "TF: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
-      "TF: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
-      "TF: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
-      "TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
-      "TF: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
-      "\n",
-      "bert/encoder/layer_10/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
-      "TF: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
-      "\n",
-      "bert/encoder/layer_10/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
-      "TF: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
-      "\n",
-      "bert/encoder/layer_10/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
-      "TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
-      "\n",
-      "bert/encoder/layer_10/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
-      "TF: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
-      "\n",
-      "bert/encoder/layer_10/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
-      "TF: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
-      "\n",
-      "bert/encoder/layer_10/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
-      "TF: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
-      "\n",
-      "bert/encoder/layer_10/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
-      "TF: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/query/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
-      "TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/query/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
-      "TF: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/key/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
-      "TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/key/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
-      "TF: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/value/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
-      "TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/self/value/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
-      "TF: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
-      "TF: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
-      "TF: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
-      "TF: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
-      "\n",
-      "bert/encoder/layer_11/attention/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
-      "TF: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
-      "\n",
-      "bert/encoder/layer_11/intermediate/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
-      "TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
-      "\n",
-      "bert/encoder/layer_11/intermediate/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
-      "TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
-      "\n",
-      "bert/encoder/layer_11/output/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
-      "TF: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
-      "\n",
-      "bert/encoder/layer_11/output/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
-      "TF: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
-      "\n",
-      "bert/encoder/layer_11/output/LayerNorm/beta\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
-      "TF: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
-      "\n",
-      "bert/encoder/layer_11/output/LayerNorm/gamma\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
-      "TF: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
-      "\n",
-      "bert/pooler/dense/kernel\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
-      "TF: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
-      "\n",
-      "bert/pooler/dense/bias\n",
-      "|sum(pt_wts - tf_wts)| = 0.0\n",
-      "PT: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
-      "TF: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensors_to_transopse = (\n",
-    "    \"dense.weight\",\n",
-    "    \"attention.self.query\",\n",
-    "    \"attention.self.key\",\n",
-    "    \"attention.self.value\"\n",
-    ")\n",
-    "var_map = (\n",
-    "    ('layer.', 'layer_'),\n",
-    "    ('word_embeddings.weight', 'word_embeddings'),\n",
-    "    ('position_embeddings.weight', 'position_embeddings'),\n",
-    "    ('token_type_embeddings.weight', 'token_type_embeddings'),\n",
-    "    ('.', '/'),\n",
-    "    ('LayerNorm/weight', 'LayerNorm/gamma'),\n",
-    "    ('LayerNorm/bias', 'LayerNorm/beta'),\n",
-    "    ('weight', 'kernel')\n",
-    ")\n",
-    "\n",
-    "def to_tf_var_name(name:str):\n",
-    "    for patt, repl in iter(var_map):\n",
-    "        name = name.replace(patt, repl)\n",
-    "    return 'bert/{}'.format(name)\n",
-    "\n",
-    "tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n",
-    "pt_vars = {}\n",
-    "for v, T in pt_model.state_dict().items():\n",
-    "    T = T.detach().numpy()\n",
-    "    if any([x in v for x in tensors_to_transopse]):\n",
-    "        T = T.T\n",
-    "    pt_vars.update({to_tf_var_name(v): T})\n",
-    "\n",
-    "for var_name in tf_vars:\n",
-    "    \n",
-    "    pt = pt_vars[var_name.strip(\":0\")]\n",
-    "    tf = tf_vars[var_name]\n",
-    "\n",
-    "    print(var_name.strip(\":0\"))\n",
-    "    \n",
-    "    # Assert equivalence\n",
-    "    print(\"|sum(pt_wts - tf_wts)| = {}\".format(\n",
-    "        np.abs(np.sum(pt - tf, keepdims=False))\n",
-    "    ))\n",
-    "    assert not np.sum(pt - tf, keepdims=False)\n",
-    "    \n",
-    "    if len(pt.shape) == 2:\n",
-    "        print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[0, :5]))\n",
-    "        print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[0, :5]))\n",
-    "    else:\n",
-    "        print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[:5]))\n",
-    "        print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[:5]))\n",
-    "    print()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare Layer-12 Projections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "MSE: 2.7155439966009e-05\n",
-      "PT-values: [-0.876663   -0.41088238 -0.12200808  0.44941     0.19445966]\n",
-      "TF-values: [-0.8742865  -0.40621698 -0.10585472  0.444904    0.1825743 ]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Mean Squared Error (MSE) between last projection of each model\n",
-    "MSE = np.mean((pt_embedding - tf_embedding) ** 2, keepdims=False)\n",
-    "print(\"MSE: {}\".format(MSE))\n",
-    "print(\"PT-values: {}\".format(pt_embedding[0, :5]))\n",
-    "print(\"TF-values: {}\".format(tf_embedding[0, :5]))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "nlp",
-   "language": "python",
-   "name": "nlp"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/server/transformers/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/server/transformers/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
deleted file mode 100644
index 809f6ea6e0f3267e50d01ee6aedee5d6316f2665..0000000000000000000000000000000000000000
--- a/server/transformers/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ /dev/null
@@ -1,4815 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch models\n",
-    "\n",
-    "You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:26.999106Z",
-     "start_time": "2018-11-16T10:02:26.985709Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:27.664528Z",
-     "start_time": "2018-11-16T10:02:27.651019Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"./samples/input.txt\"\n",
-    "max_seq_length = 128\n",
-    "max_predictions_per_seq = 20\n",
-    "\n",
-    "masked_lm_positions = [6]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.202182Z",
-     "start_time": "2018-11-16T10:02:28.112570Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "import tensorflow as tf\n",
-    "import pytorch_transformers as ppb\n",
-    "\n",
-    "def del_all_flags(FLAGS):\n",
-    "    flags_dict = FLAGS._flags()    \n",
-    "    keys_list = [keys for keys in flags_dict]    \n",
-    "    for keys in keys_list:\n",
-    "        FLAGS.__delattr__(keys)\n",
-    "\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.extract_features as ef\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.modeling as tfm\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.tokenization as tft\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.run_pretraining as rp\n",
-    "del_all_flags(tf.flags.FLAGS)\n",
-    "import tensorflow_code.create_pretraining_data as cpp"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.238027Z",
-     "start_time": "2018-11-16T10:02:30.204943Z"
-    },
-    "code_folding": [
-     15
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "import re\n",
-    "class InputExample(object):\n",
-    "    \"\"\"A single instance example.\"\"\"\n",
-    "\n",
-    "    def __init__(self, tokens, segment_ids, masked_lm_positions,\n",
-    "                 masked_lm_labels, is_random_next):\n",
-    "        self.tokens = tokens\n",
-    "        self.segment_ids = segment_ids\n",
-    "        self.masked_lm_positions = masked_lm_positions\n",
-    "        self.masked_lm_labels = masked_lm_labels\n",
-    "        self.is_random_next = is_random_next\n",
-    "    def __repr__(self):\n",
-    "        return '\\n'.join(k + \":\" + str(v) for k, v in self.__dict__.items())\n",
-    "\n",
-    "\n",
-    "def read_examples(input_file, tokenizer, masked_lm_positions):\n",
-    "    \"\"\"Read a list of `InputExample`s from an input file.\"\"\"\n",
-    "    examples = []\n",
-    "    unique_id = 0\n",
-    "    with tf.gfile.GFile(input_file, \"r\") as reader:\n",
-    "        while True:\n",
-    "            line = reader.readline()\n",
-    "            if not line:\n",
-    "                break\n",
-    "            line = line.strip()\n",
-    "            text_a = None\n",
-    "            text_b = None\n",
-    "            m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n",
-    "            if m is None:\n",
-    "                text_a = line\n",
-    "            else:\n",
-    "                text_a = m.group(1)\n",
-    "                text_b = m.group(2)\n",
-    "            tokens_a = tokenizer.tokenize(text_a)\n",
-    "            tokens_b = None\n",
-    "            if text_b:\n",
-    "                tokens_b = tokenizer.tokenize(text_b)\n",
-    "            tokens = tokens_a + tokens_b\n",
-    "            masked_lm_labels = []\n",
-    "            for m_pos in masked_lm_positions:\n",
-    "                masked_lm_labels.append(tokens[m_pos])\n",
-    "                tokens[m_pos] = '[MASK]'\n",
-    "            examples.append(\n",
-    "                InputExample(\n",
-    "                    tokens = tokens,\n",
-    "                    segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b),\n",
-    "                    masked_lm_positions = masked_lm_positions,\n",
-    "                    masked_lm_labels = masked_lm_labels,\n",
-    "                    is_random_next = False))\n",
-    "            unique_id += 1\n",
-    "    return examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:30.304018Z",
-     "start_time": "2018-11-16T10:02:30.240189Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tokens:['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n",
-      "masked_lm_positions:[6]\n",
-      "masked_lm_labels:['henson']\n",
-      "is_random_next:False\n"
-     ]
-    }
-   ],
-   "source": [
-    "bert_config = tfm.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = ppb.BertTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "examples = read_examples(input_file, tokenizer, masked_lm_positions=masked_lm_positions)\n",
-    "\n",
-    "print(examples[0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:33.324167Z",
-     "start_time": "2018-11-16T10:02:33.291909Z"
-    },
-    "code_folding": [
-     16
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "class InputFeatures(object):\n",
-    "    \"\"\"A single set of features of data.\"\"\"\n",
-    "\n",
-    "    def __init__(self, input_ids, input_mask, segment_ids, masked_lm_positions,\n",
-    "                 masked_lm_ids, masked_lm_weights, next_sentence_label):\n",
-    "        self.input_ids = input_ids\n",
-    "        self.input_mask = input_mask\n",
-    "        self.segment_ids = segment_ids\n",
-    "        self.masked_lm_positions = masked_lm_positions\n",
-    "        self.masked_lm_ids = masked_lm_ids\n",
-    "        self.masked_lm_weights = masked_lm_weights\n",
-    "        self.next_sentence_labels = next_sentence_label\n",
-    "\n",
-    "    def __repr__(self):\n",
-    "        return '\\n'.join(k + \":\" + str(v) for k, v in self.__dict__.items())\n",
-    "\n",
-    "def pretraining_convert_examples_to_features(instances, tokenizer, max_seq_length,\n",
-    "                                 max_predictions_per_seq):\n",
-    "    \"\"\"Create TF example files from `TrainingInstance`s.\"\"\"\n",
-    "    features = []\n",
-    "    for (inst_index, instance) in enumerate(instances):\n",
-    "        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)\n",
-    "        input_mask = [1] * len(input_ids)\n",
-    "        segment_ids = list(instance.segment_ids)\n",
-    "        assert len(input_ids) <= max_seq_length\n",
-    "\n",
-    "        while len(input_ids) < max_seq_length:\n",
-    "            input_ids.append(0)\n",
-    "            input_mask.append(0)\n",
-    "            segment_ids.append(0)\n",
-    "\n",
-    "        assert len(input_ids) == max_seq_length\n",
-    "        assert len(input_mask) == max_seq_length\n",
-    "        assert len(segment_ids) == max_seq_length\n",
-    "\n",
-    "        masked_lm_positions = list(instance.masked_lm_positions)\n",
-    "        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)\n",
-    "        masked_lm_weights = [1.0] * len(masked_lm_ids)\n",
-    "\n",
-    "        while len(masked_lm_positions) < max_predictions_per_seq:\n",
-    "            masked_lm_positions.append(0)\n",
-    "            masked_lm_ids.append(0)\n",
-    "            masked_lm_weights.append(0.0)\n",
-    "\n",
-    "        next_sentence_label = 1 if instance.is_random_next else 0\n",
-    "\n",
-    "        features.append(\n",
-    "            InputFeatures(input_ids, input_mask, segment_ids,\n",
-    "                          masked_lm_positions, masked_lm_ids,\n",
-    "                          masked_lm_weights, next_sentence_label))\n",
-    "\n",
-    "        if inst_index < 5:\n",
-    "            tf.logging.info(\"*** Example ***\")\n",
-    "            tf.logging.info(\"tokens: %s\" % \" \".join(\n",
-    "                [str(x) for x in instance.tokens]))\n",
-    "            tf.logging.info(\"features: %s\" % str(features[-1]))\n",
-    "    return features"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:34.185367Z",
-     "start_time": "2018-11-16T10:02:34.155046Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   *** Example ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
-      "next_sentence_labels:0\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:34 - INFO - tensorflow -   features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
-      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
-      "next_sentence_labels:0\n"
-     ]
-    }
-   ],
-   "source": [
-    "features = pretraining_convert_examples_to_features(\n",
-    "    instances=examples, max_seq_length=max_seq_length, \n",
-    "    max_predictions_per_seq=max_predictions_per_seq, tokenizer=tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:34.912005Z",
-     "start_time": "2018-11-16T10:02:34.882111Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def input_fn_builder(features, seq_length, max_predictions_per_seq, tokenizer):\n",
-    "    \"\"\"Creates an `input_fn` closure to be passed to TPUEstimator.\"\"\"\n",
-    "\n",
-    "    all_input_ids = []\n",
-    "    all_input_mask = []\n",
-    "    all_segment_ids = []\n",
-    "    all_masked_lm_positions = []\n",
-    "    all_masked_lm_ids = []\n",
-    "    all_masked_lm_weights = []\n",
-    "    all_next_sentence_labels = []\n",
-    "\n",
-    "    for feature in features:\n",
-    "        all_input_ids.append(feature.input_ids)\n",
-    "        all_input_mask.append(feature.input_mask)\n",
-    "        all_segment_ids.append(feature.segment_ids)\n",
-    "        all_masked_lm_positions.append(feature.masked_lm_positions)\n",
-    "        all_masked_lm_ids.append(feature.masked_lm_ids)\n",
-    "        all_masked_lm_weights.append(feature.masked_lm_weights)\n",
-    "        all_next_sentence_labels.append(feature.next_sentence_labels)\n",
-    "\n",
-    "    def input_fn(params):\n",
-    "        \"\"\"The actual input function.\"\"\"\n",
-    "        batch_size = params[\"batch_size\"]\n",
-    "\n",
-    "        num_examples = len(features)\n",
-    "\n",
-    "        # This is for demo purposes and does NOT scale to large data sets. We do\n",
-    "        # not use Dataset.from_generator() because that uses tf.py_func which is\n",
-    "        # not TPU compatible. The right way to load data is with TFRecordReader.\n",
-    "        d = tf.data.Dataset.from_tensor_slices({\n",
-    "            \"input_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_input_ids, shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"input_mask\":\n",
-    "                tf.constant(\n",
-    "                    all_input_mask,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"segment_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_segment_ids,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"masked_lm_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_positions,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.int32),\n",
-    "        \"masked_lm_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_ids,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.int32),\n",
-    "        \"masked_lm_weights\":\n",
-    "                tf.constant(\n",
-    "                    all_masked_lm_weights,\n",
-    "                    shape=[num_examples, max_predictions_per_seq],\n",
-    "                    dtype=tf.float32),\n",
-    "        \"next_sentence_labels\":\n",
-    "                tf.constant(\n",
-    "                    all_next_sentence_labels,\n",
-    "                    shape=[num_examples, 1],\n",
-    "                    dtype=tf.int32),\n",
-    "        })\n",
-    "\n",
-    "        d = d.batch(batch_size=batch_size, drop_remainder=False)\n",
-    "        return d\n",
-    "\n",
-    "    return input_fn\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:35.671603Z",
-     "start_time": "2018-11-16T10:02:35.626167Z"
-    },
-    "code_folding": [
-     64,
-     77
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "def model_fn_builder(bert_config, init_checkpoint, learning_rate,\n",
-    "                     num_train_steps, num_warmup_steps, use_tpu,\n",
-    "                     use_one_hot_embeddings):\n",
-    "    \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
-    "\n",
-    "    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
-    "        \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
-    "\n",
-    "        tf.logging.info(\"*** Features ***\")\n",
-    "        for name in sorted(features.keys()):\n",
-    "            tf.logging.info(\"  name = %s, shape = %s\" % (name, features[name].shape))\n",
-    "\n",
-    "        input_ids = features[\"input_ids\"]\n",
-    "        input_mask = features[\"input_mask\"]\n",
-    "        segment_ids = features[\"segment_ids\"]\n",
-    "        masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "        masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "        masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "        next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "\n",
-    "        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
-    "\n",
-    "        model = tfm.BertModel(\n",
-    "            config=bert_config,\n",
-    "            is_training=is_training,\n",
-    "            input_ids=input_ids,\n",
-    "            input_mask=input_mask,\n",
-    "            token_type_ids=segment_ids,\n",
-    "            use_one_hot_embeddings=use_one_hot_embeddings)\n",
-    "\n",
-    "        (masked_lm_loss,\n",
-    "         masked_lm_example_loss, masked_lm_log_probs) = rp.get_masked_lm_output(\n",
-    "            bert_config, model.get_sequence_output(), model.get_embedding_table(),\n",
-    "            masked_lm_positions, masked_lm_ids, masked_lm_weights)\n",
-    "\n",
-    "        (next_sentence_loss, next_sentence_example_loss,\n",
-    "         next_sentence_log_probs) = rp.get_next_sentence_output(\n",
-    "            bert_config, model.get_pooled_output(), next_sentence_labels)\n",
-    "\n",
-    "        total_loss = masked_lm_loss + next_sentence_loss\n",
-    "\n",
-    "        tvars = tf.trainable_variables()\n",
-    "\n",
-    "        initialized_variable_names = {}\n",
-    "        scaffold_fn = None\n",
-    "        if init_checkpoint:\n",
-    "            (assignment_map,\n",
-    "             initialized_variable_names) = tfm.get_assigment_map_from_checkpoint(\n",
-    "                tvars, init_checkpoint)\n",
-    "            if use_tpu:\n",
-    "\n",
-    "                def tpu_scaffold():\n",
-    "                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "                    return tf.train.Scaffold()\n",
-    "\n",
-    "                scaffold_fn = tpu_scaffold\n",
-    "            else:\n",
-    "                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "\n",
-    "        tf.logging.info(\"**** Trainable Variables ****\")\n",
-    "        for var in tvars:\n",
-    "            init_string = \"\"\n",
-    "            if var.name in initialized_variable_names:\n",
-    "                init_string = \", *INIT_FROM_CKPT*\"\n",
-    "            tf.logging.info(\"  name = %s, shape = %s%s\", var.name, var.shape,\n",
-    "                            init_string)\n",
-    "\n",
-    "        output_spec = None\n",
-    "        if mode == tf.estimator.ModeKeys.TRAIN:\n",
-    "            masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "            masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "            masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "            next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "            train_op = optimization.create_optimizer(\n",
-    "                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                train_op=train_op,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.EVAL:\n",
-    "            masked_lm_positions = features[\"masked_lm_positions\"]\n",
-    "            masked_lm_ids = features[\"masked_lm_ids\"]\n",
-    "            masked_lm_weights = features[\"masked_lm_weights\"]\n",
-    "            next_sentence_labels = features[\"next_sentence_labels\"]\n",
-    "\n",
-    "            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
-    "                          masked_lm_weights, next_sentence_example_loss,\n",
-    "                          next_sentence_log_probs, next_sentence_labels):\n",
-    "                \"\"\"Computes the loss and accuracy of the model.\"\"\"\n",
-    "                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
-    "                                                 [-1, masked_lm_log_probs.shape[-1]])\n",
-    "                masked_lm_predictions = tf.argmax(\n",
-    "                    masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
-    "                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])\n",
-    "                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])\n",
-    "                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])\n",
-    "                masked_lm_accuracy = tf.metrics.accuracy(\n",
-    "                    labels=masked_lm_ids,\n",
-    "                    predictions=masked_lm_predictions,\n",
-    "                    weights=masked_lm_weights)\n",
-    "                masked_lm_mean_loss = tf.metrics.mean(\n",
-    "                    values=masked_lm_example_loss, weights=masked_lm_weights)\n",
-    "\n",
-    "                next_sentence_log_probs = tf.reshape(\n",
-    "                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
-    "                next_sentence_predictions = tf.argmax(\n",
-    "                    next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
-    "                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])\n",
-    "                next_sentence_accuracy = tf.metrics.accuracy(\n",
-    "                    labels=next_sentence_labels, predictions=next_sentence_predictions)\n",
-    "                next_sentence_mean_loss = tf.metrics.mean(\n",
-    "                    values=next_sentence_example_loss)\n",
-    "\n",
-    "                return {\n",
-    "                    \"masked_lm_accuracy\": masked_lm_accuracy,\n",
-    "                    \"masked_lm_loss\": masked_lm_mean_loss,\n",
-    "                    \"next_sentence_accuracy\": next_sentence_accuracy,\n",
-    "                    \"next_sentence_loss\": next_sentence_mean_loss,\n",
-    "                }\n",
-    "\n",
-    "            eval_metrics = (metric_fn, [\n",
-    "                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
-    "                masked_lm_weights, next_sentence_example_loss,\n",
-    "                next_sentence_log_probs, next_sentence_labels\n",
-    "            ])\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                eval_metrics=eval_metrics,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-    "            masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
-    "                                                [-1, masked_lm_log_probs.shape[-1]])\n",
-    "            masked_lm_predictions = tf.argmax(\n",
-    "                masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
-    "\n",
-    "            next_sentence_log_probs = tf.reshape(\n",
-    "                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
-    "            next_sentence_predictions = tf.argmax(\n",
-    "                next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
-    "\n",
-    "            masked_lm_predictions = tf.reshape(masked_lm_predictions,\n",
-    "                                                [1, masked_lm_positions.shape[-1]])\n",
-    "            next_sentence_predictions = tf.reshape(next_sentence_predictions,\n",
-    "                                                [1, 1])\n",
-    "\n",
-    "            predictions = {\n",
-    "                \"masked_lm_predictions\": masked_lm_predictions,\n",
-    "                \"next_sentence_predictions\": next_sentence_predictions\n",
-    "            }\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\n",
-    "            return output_spec\n",
-    "        else:\n",
-    "            raise ValueError(\"Only TRAIN, EVAL and PREDICT modes are supported: %s\" % (mode))\n",
-    "\n",
-    "        return output_spec\n",
-    "\n",
-    "    return model_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:40.328700Z",
-     "start_time": "2018-11-16T10:02:36.289676Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   _TPUContext: eval_on_tpu True\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - WARNING - tensorflow -   eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    master=None,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        num_shards=1,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    learning_rate=0,\n",
-    "    num_train_steps=1,\n",
-    "    num_warmup_steps=1,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "# If TPU is not available, this will fall back to normal Estimator on CPU\n",
-    "# or GPU.\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "input_fn = input_fn_builder(\n",
-    "    features=features, seq_length=max_seq_length, max_predictions_per_seq=max_predictions_per_seq,\n",
-    "tokenizer=tokenizer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.596956Z",
-     "start_time": "2018-11-16T10:02:40.331008Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Running infer on CPU\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   Running infer on CPU\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Features ***\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -   *** Features ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = input_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = input_mask, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_mask, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_ids, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_ids, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_positions, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_positions, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = masked_lm_weights, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_weights, shape = (?, 20)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = next_sentence_labels, shape = (?, 1)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = next_sentence_labels, shape = (?, 1)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = segment_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:40 - INFO - tensorflow -     name = segment_ids, shape = (?, 128)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:**** Trainable Variables ****\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -   **** Trainable Variables ****\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Done calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:43 - INFO - tensorflow -   Done calling model_fn.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Graph was finalized.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:44 - INFO - tensorflow -   Graph was finalized.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:45 - INFO - tensorflow -   Running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Done running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:45 - INFO - tensorflow -   Done running local_init_op.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
-    "    tensorflow_all_out.append(result)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.634304Z",
-     "start_time": "2018-11-16T10:02:46.598800Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "dict_keys(['masked_lm_predictions', 'next_sentence_predictions'])\n",
-      "masked_lm_predictions [27227  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010\n",
-      "  1010  1010  1010  1010  1010  1010  1010  1010]\n",
-      "predicted token ['henson', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"masked_lm_predictions\", tensorflow_all_out[0]['masked_lm_predictions'])\n",
-    "print(\"predicted token\", tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:02:46.671229Z",
-     "start_time": "2018-11-16T10:02:46.637102Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensorflow_output: ['henson']\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_outputs = tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions'])[:len(masked_lm_positions)]\n",
-    "print(\"tensorflow_output:\", tensorflow_outputs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:03.556557Z",
-     "start_time": "2018-11-16T10:03:03.519654Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from examples import extract_features\n",
-    "from examples.extract_features import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:03.952710Z",
-     "start_time": "2018-11-16T10:03:03.921917Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.307673Z",
-     "start_time": "2018-11-16T10:03:04.439317Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
-      "  \"attention_probs_dropout_prob\": 0.1,\n",
-      "  \"hidden_act\": \"gelu\",\n",
-      "  \"hidden_dropout_prob\": 0.1,\n",
-      "  \"hidden_size\": 768,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"max_position_embeddings\": 512,\n",
-      "  \"num_attention_heads\": 12,\n",
-      "  \"num_hidden_layers\": 12,\n",
-      "  \"type_vocab_size\": 2,\n",
-      "  \"vocab_size\": 30522\n",
-      "}\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "BertForPreTraining(\n",
-       "  (bert): BertModel(\n",
-       "    (embeddings): BertEmbeddings(\n",
-       "      (word_embeddings): Embedding(30522, 768)\n",
-       "      (position_embeddings): Embedding(512, 768)\n",
-       "      (token_type_embeddings): Embedding(2, 768)\n",
-       "      (LayerNorm): BertLayerNorm()\n",
-       "      (dropout): Dropout(p=0.1)\n",
-       "    )\n",
-       "    (encoder): BertEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (pooler): BertPooler(\n",
-       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "      (activation): Tanh()\n",
-       "    )\n",
-       "  )\n",
-       "  (cls): BertPreTrainingHeads(\n",
-       "    (predictions): BertLMPredictionHead(\n",
-       "      (transform): BertPredictionHeadTransform(\n",
-       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "        (LayerNorm): BertLayerNorm()\n",
-       "      )\n",
-       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
-       "    )\n",
-       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = ppb.BertForPreTraining.from_pretrained('bert-base-uncased')\n",
-    "model.to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.351625Z",
-     "start_time": "2018-11-16T10:03:12.310736Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BertForPreTraining(\n",
-       "  (bert): BertModel(\n",
-       "    (embeddings): BertEmbeddings(\n",
-       "      (word_embeddings): Embedding(30522, 768)\n",
-       "      (position_embeddings): Embedding(512, 768)\n",
-       "      (token_type_embeddings): Embedding(2, 768)\n",
-       "      (LayerNorm): BertLayerNorm()\n",
-       "      (dropout): Dropout(p=0.1)\n",
-       "    )\n",
-       "    (encoder): BertEncoder(\n",
-       "      (layer): ModuleList(\n",
-       "        (0): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (1): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (3): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (4): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (5): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (6): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (7): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (8): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (9): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (10): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (11): BertLayer(\n",
-       "          (attention): BertAttention(\n",
-       "            (self): BertSelfAttention(\n",
-       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "            (output): BertSelfOutput(\n",
-       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "              (LayerNorm): BertLayerNorm()\n",
-       "              (dropout): Dropout(p=0.1)\n",
-       "            )\n",
-       "          )\n",
-       "          (intermediate): BertIntermediate(\n",
-       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "          )\n",
-       "          (output): BertOutput(\n",
-       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (pooler): BertPooler(\n",
-       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "      (activation): Tanh()\n",
-       "    )\n",
-       "  )\n",
-       "  (cls): BertPreTrainingHeads(\n",
-       "    (predictions): BertLMPredictionHead(\n",
-       "      (transform): BertPredictionHeadTransform(\n",
-       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "        (LayerNorm): BertLayerNorm()\n",
-       "      )\n",
-       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
-       "    )\n",
-       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
-    "all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n",
-    "all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in features], dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_positions)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.792741Z",
-     "start_time": "2018-11-16T10:03:12.354253Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[ 2040,  2001,  3958, 27227,  1029,  3958,   103,  2001,  1037, 13997,\n",
-      "         11510,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
-      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "(1, 20, 30522)\n",
-      "[27227, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "pytorch_all_out = []\n",
-    "for input_ids, input_mask, segment_ids, tensor_masked_lm_positions in eval_dataloader:\n",
-    "    print(input_ids)\n",
-    "    print(input_mask)\n",
-    "    print(segment_ids)\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "    segment_ids = segment_ids.to(device)\n",
-    "\n",
-    "    prediction_scores, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)\n",
-    "    prediction_scores = prediction_scores[0, tensor_masked_lm_positions].detach().cpu().numpy()\n",
-    "    print(prediction_scores.shape)\n",
-    "    masked_lm_predictions = np.argmax(prediction_scores, axis=-1).squeeze().tolist()\n",
-    "    print(masked_lm_predictions)\n",
-    "    pytorch_all_out.append(masked_lm_predictions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-16T10:03:12.828439Z",
-     "start_time": "2018-11-16T10:03:12.795420Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "pytorch_output: ['henson']\n",
-      "tensorflow_output: ['henson']\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_outputs = tokenizer.convert_ids_to_tokens(pytorch_all_out[0])[:len(masked_lm_positions)]\n",
-    "print(\"pytorch_output:\", pytorch_outputs)\n",
-    "print(\"tensorflow_output:\", tensorflow_outputs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/server/transformers/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb b/server/transformers/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
deleted file mode 100644
index a75e052643f59bd80617f0682101267d1a0e134b..0000000000000000000000000000000000000000
--- a/server/transformers/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
+++ /dev/null
@@ -1,1644 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch model on the SQuAD task\n",
-    "\n",
-    "You can use this small notebook to check the loss computation from the TensorFlow model to the PyTorch model. In the following, we compare the total loss computed by the models starting from identical initializations (position prediction linear layers with weights at 1 and bias at 0).\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicated in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed, change the relative paths indicated in this notebook (at the beginning of Sections 1 and 2) to point to the relevant models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:33.636911Z",
-     "start_time": "2018-11-06T10:11:33.623091Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:33.651792Z",
-     "start_time": "2018-11-06T10:11:33.638984Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"../data/squad_data/train-v1.1.json\"\n",
-    "max_seq_length = 384\n",
-    "outside_pos = max_seq_length + 10\n",
-    "doc_stride = 128\n",
-    "max_query_length = 64\n",
-    "max_answer_length = 30\n",
-    "output_dir = \"/tmp/squad_base/\"\n",
-    "learning_rate = 3e-5"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:35.165788Z",
-     "start_time": "2018-11-06T10:11:33.653401Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/modeling.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['modeling_tensorflow'] = module\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_bert_squad.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['run_squad_tensorflow'] = module\n",
-    "import modeling_tensorflow\n",
-    "from run_squad_tensorflow import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.494391Z",
-     "start_time": "2018-11-06T10:11:35.168615Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000000\n",
-      "INFO:tensorflow:example_index: 0\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] to whom did the virgin mary allegedly appear in 1858 in lou ##rdes france ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\n",
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\n",
-      "INFO:tensorflow:input_ids: 101 2000 3183 2106 1996 6261 2984 9382 3711 1999 8517 1999 10223 26371 2605 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 130\n",
-      "INFO:tensorflow:end_position: 137\n",
-      "INFO:tensorflow:answer: saint bern ##ade ##tte so ##ub ##iro ##us\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000001\n",
-      "INFO:tensorflow:example_index: 1\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is in front of the notre dame main building ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:0 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:6 23:7 24:8 25:9 26:10 27:10 28:10 29:11 30:12 31:13 32:14 33:15 34:16 35:17 36:18 37:19 38:20 39:20 40:21 41:22 42:23 43:24 44:25 45:26 46:27 47:28 48:29 49:30 50:30 51:31 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:39 60:39 61:39 62:40 63:41 64:42 65:43 66:43 67:43 68:43 69:44 70:45 71:46 72:46 73:46 74:46 75:47 76:48 77:49 78:50 79:51 80:52 81:53 82:54 83:55 84:56 85:57 86:58 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:65 96:65 97:66 98:67 99:68 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:77 110:78 111:79 112:79 113:80 114:81 115:81 116:81 117:82 118:83 119:84 120:85 121:86 122:87 123:87 124:88 125:89 126:90 127:91 128:91 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:94 137:95 138:96 139:97 140:98 141:99 142:100 143:101 144:102 145:102 146:103 147:104 148:105 149:106 150:107 151:108 152:109 153:110 154:111 155:112 156:113 157:114 158:115 159:115 160:115 161:116 162:117 163:118 164:118 165:119 166:120 167:121 168:122 169:123 170:123\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1999 2392 1997 1996 10289 8214 2364 2311 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 52\n",
-      "INFO:tensorflow:end_position: 56\n",
-      "INFO:tensorflow:answer: a copper statue of christ\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000002\n",
-      "INFO:tensorflow:example_index: 2\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] the basilica of the sacred heart at notre dame is beside to which structure ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:0 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:6 27:7 28:8 29:9 30:10 31:10 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:27 51:28 52:29 53:30 54:30 55:31 56:32 57:33 58:34 59:35 60:36 61:37 62:38 63:39 64:39 65:39 66:40 67:41 68:42 69:43 70:43 71:43 72:43 73:44 74:45 75:46 76:46 77:46 78:46 79:47 80:48 81:49 82:50 83:51 84:52 85:53 86:54 87:55 88:56 89:57 90:58 91:58 92:59 93:60 94:61 95:62 96:63 97:64 98:65 99:65 100:65 101:66 102:67 103:68 104:69 105:70 106:71 107:72 108:72 109:73 110:74 111:75 112:76 113:77 114:78 115:79 116:79 117:80 118:81 119:81 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:92 136:92 137:92 138:93 139:94 140:94 141:95 142:96 143:97 144:98 145:99 146:100 147:101 148:102 149:102 150:103 151:104 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:115 164:115 165:116 166:117 167:118 168:118 169:119 170:120 171:121 172:122 173:123 174:123\n",
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True\n",
-      "INFO:tensorflow:input_ids: 101 1996 13546 1997 1996 6730 2540 2012 10289 8214 2003 3875 2000 2029 3252 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 81\n",
-      "INFO:tensorflow:end_position: 83\n",
-      "INFO:tensorflow:answer: the main building\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000003\n",
-      "INFO:tensorflow:example_index: 3\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the gr ##otto at notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:0 13:0 14:1 15:2 16:3 17:4 18:5 19:6 20:6 21:7 22:8 23:9 24:10 25:10 26:10 27:11 28:12 29:13 30:14 31:15 32:16 33:17 34:18 35:19 36:20 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:27 45:28 46:29 47:30 48:30 49:31 50:32 51:33 52:34 53:35 54:36 55:37 56:38 57:39 58:39 59:39 60:40 61:41 62:42 63:43 64:43 65:43 66:43 67:44 68:45 69:46 70:46 71:46 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:58 86:59 87:60 88:61 89:62 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:77 108:78 109:79 110:79 111:80 112:81 113:81 114:81 115:82 116:83 117:84 118:85 119:86 120:87 121:87 122:88 123:89 124:90 125:91 126:91 127:91 128:92 129:92 130:92 131:92 132:93 133:94 134:94 135:95 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:102 144:103 145:104 146:105 147:106 148:107 149:108 150:109 151:110 152:111 153:112 154:113 155:114 156:115 157:115 158:115 159:116 160:117 161:118 162:118 163:119 164:120 165:121 166:122 167:123 168:123\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 24665 23052 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 95\n",
-      "INFO:tensorflow:end_position: 101\n",
-      "INFO:tensorflow:answer: a marian place of prayer and reflection\n",
-      "INFO:tensorflow:*** Example ***\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:unique_id: 1000000004\n",
-      "INFO:tensorflow:example_index: 4\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what sits on top of the main building at notre dame ? [SEP] architectural ##ly , the school has a catholic character . atop the main building ' s gold dome is a golden statue of the virgin mary . immediately in front of the main building and facing it , is a copper statue of christ with arms up ##rai ##sed with the legend \" ve ##ni ##te ad me om ##nes \" . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a marian place of prayer and reflection . it is a replica of the gr ##otto at lou ##rdes , france where the virgin mary reputed ##ly appeared to saint bern ##ade ##tte so ##ub ##iro ##us in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mary . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:0 16:0 17:1 18:2 19:3 20:4 21:5 22:6 23:6 24:7 25:8 26:9 27:10 28:10 29:10 30:11 31:12 32:13 33:14 34:15 35:16 36:17 37:18 38:19 39:20 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:28 49:29 50:30 51:30 52:31 53:32 54:33 55:34 56:35 57:36 58:37 59:38 60:39 61:39 62:39 63:40 64:41 65:42 66:43 67:43 68:43 69:43 70:44 71:45 72:46 73:46 74:46 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:58 89:59 90:60 91:61 92:62 93:63 94:64 95:65 96:65 97:65 98:66 99:67 100:68 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:77 111:78 112:79 113:79 114:80 115:81 116:81 117:81 118:82 119:83 120:84 121:85 122:86 123:87 124:87 125:88 126:89 127:90 128:91 129:91 130:91 131:92 132:92 133:92 134:92 135:93 136:94 137:94 138:95 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:102 147:103 148:104 149:105 150:106 151:107 152:108 153:109 154:110 155:111 156:112 157:113 158:114 159:115 160:115 161:115 162:116 163:117 164:118 165:118 166:119 167:120 168:121 169:122 170:123 171:123\n",
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 7719 2006 2327 1997 1996 2364 2311 2012 10289 8214 1029 102 6549 2135 1010 1996 2082 2038 1037 3234 2839 1012 10234 1996 2364 2311 1005 1055 2751 8514 2003 1037 3585 6231 1997 1996 6261 2984 1012 3202 1999 2392 1997 1996 2364 2311 1998 5307 2009 1010 2003 1037 6967 6231 1997 4828 2007 2608 2039 14995 6924 2007 1996 5722 1000 2310 3490 2618 4748 2033 18168 5267 1000 1012 2279 2000 1996 2364 2311 2003 1996 13546 1997 1996 6730 2540 1012 3202 2369 1996 13546 2003 1996 24665 23052 1010 1037 14042 2173 1997 7083 1998 9185 1012 2009 2003 1037 15059 1997 1996 24665 23052 2012 10223 26371 1010 2605 2073 1996 6261 2984 22353 2135 2596 2000 3002 16595 9648 4674 2061 12083 9711 2271 1999 8517 1012 2012 1996 2203 1997 1996 2364 3298 1006 1998 1999 1037 3622 2240 2008 8539 2083 1017 11342 1998 1996 2751 8514 1007 1010 2003 1037 3722 1010 2715 2962 6231 1997 2984 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 33\n",
-      "INFO:tensorflow:end_position: 39\n",
-      "INFO:tensorflow:answer: a golden statue of the virgin mary\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000005\n",
-      "INFO:tensorflow:example_index: 5\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] when did the scholastic magazine of notre dame begin publishing ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2043 2106 1996 24105 2932 1997 10289 8214 4088 4640 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 63\n",
-      "INFO:tensorflow:end_position: 64\n",
-      "INFO:tensorflow:answer: september 1876\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000006\n",
-      "INFO:tensorflow:example_index: 6\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how often is notre dame ' s the jug ##gler published ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:4 20:5 21:6 22:6 23:6 24:7 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:14 33:15 34:16 35:17 36:17 37:17 38:18 39:19 40:20 41:21 42:21 43:22 44:23 45:24 46:25 47:26 48:27 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:36 61:36 62:37 63:38 64:39 65:40 66:40 67:41 68:42 69:43 70:44 71:45 72:46 73:47 74:48 75:49 76:50 77:51 78:52 79:53 80:54 81:55 82:56 83:57 84:58 85:59 86:60 87:60 88:61 89:62 90:63 91:63 92:64 93:65 94:65 95:65 96:66 97:67 98:68 99:69 100:70 101:71 102:72 103:73 104:74 105:75 106:76 107:77 108:77 109:78 110:79 111:80 112:81 113:82 114:83 115:83 116:84 117:85 118:86 119:87 120:88 121:89 122:89 123:90 124:91 125:92 126:93 127:94 128:95 129:96 130:97 131:98 132:99 133:100 134:101 135:101 136:102 137:103 138:104 139:105 140:106 141:107 142:108 143:109 144:110 145:111 146:112 147:112 148:112 149:113 150:113 151:114 152:115 153:116 154:117 155:118 156:118 157:119 158:120 159:121 160:122 161:123 162:124 163:125 164:126 165:127 166:128 167:129 168:130 169:131 170:132 171:133 172:134 173:135 174:136 175:137 176:138 177:138 178:139 179:140 180:140 181:141 182:142 183:143 184:144 185:145 186:146 187:147 188:148 189:149 190:150 191:151 192:152 193:153 194:153 195:154 196:155 197:156 198:156 199:157 200:158 201:159 202:160 203:160 204:161 205:161 206:162 207:163 208:163 209:164 210:165 211:166 212:167 213:168 214:169 215:170 216:171 217:172 218:173 219:174 220:174 221:175 222:176 223:177 224:178 225:179 226:180 227:181 228:182 229:182 230:183 231:184 232:185 233:186 234:187 235:188 236:189 237:190 238:191 239:191 240:192 241:192 242:193 243:194 244:195 245:196 246:197 247:198 248:199 249:199 250:200 251:200 252:201 253:202 254:203 255:204 256:205 257:206 258:207 259:208 260:209 261:210 262:210 263:211 264:212 265:212 266:213 267:214 268:215 269:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 
240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2411 2003 10289 8214 1005 1055 1996 26536 17420 2405 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 98\n",
-      "INFO:tensorflow:end_position: 98\n",
-      "INFO:tensorflow:answer: twice\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000007\n",
-      "INFO:tensorflow:example_index: 7\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the daily student paper at notre dame called ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 3679 3076 3259 2012 10289 8214 2170 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 123\n",
-      "INFO:tensorflow:end_position: 124\n",
-      "INFO:tensorflow:answer: the observer\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000008\n",
-      "INFO:tensorflow:example_index: 8\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many student news papers are found at notre dame ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 13:0 14:1 15:2 16:3 17:4 18:4 19:5 20:6 21:6 22:6 23:7 24:8 25:9 26:10 27:11 28:12 29:13 30:14 31:14 32:15 33:16 34:17 35:17 36:17 37:18 38:19 39:20 40:21 41:21 42:22 43:23 44:24 45:25 46:26 47:27 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:36 60:36 61:37 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:55 81:56 82:57 83:58 84:59 85:60 86:60 87:61 88:62 89:63 90:63 91:64 92:65 93:65 94:65 95:66 96:67 97:68 98:69 99:70 100:71 101:72 102:73 103:74 104:75 105:76 106:77 107:77 108:78 109:79 110:80 111:81 112:82 113:83 114:83 115:84 116:85 117:86 118:87 119:88 120:89 121:89 122:90 123:91 124:92 125:93 126:94 127:95 128:96 129:97 130:98 131:99 132:100 133:101 134:101 135:102 136:103 137:104 138:105 139:106 140:107 141:108 142:109 143:110 144:111 145:112 146:112 147:112 148:113 149:113 150:114 151:115 152:116 153:117 154:118 155:118 156:119 157:120 158:121 159:122 160:123 161:124 162:125 163:126 164:127 165:128 166:129 167:130 168:131 169:132 170:133 171:134 172:135 173:136 174:137 175:138 176:138 177:139 178:140 179:140 180:141 181:142 182:143 183:144 184:145 185:146 186:147 187:148 188:149 189:150 190:151 191:152 192:153 193:153 194:154 195:155 196:156 197:156 198:157 199:158 200:159 201:160 202:160 203:161 204:161 205:162 206:163 207:163 208:164 209:165 210:166 211:167 212:168 213:169 214:170 215:171 216:172 217:173 218:174 219:174 220:175 221:176 222:177 223:178 224:179 225:180 226:181 227:182 228:182 229:183 230:184 231:185 232:186 233:187 234:188 235:189 236:190 237:191 238:191 239:192 240:192 241:193 242:194 243:195 244:196 245:197 246:198 247:199 248:199 249:200 250:200 251:201 252:202 253:203 254:204 255:205 256:206 257:207 258:208 259:209 260:210 261:210 262:211 263:212 264:212 265:213 266:214 267:215 268:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 
239:True 240:True 241:True 242:True 243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 3076 2739 4981 2024 2179 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 39\n",
-      "INFO:tensorflow:end_position: 39\n",
-      "INFO:tensorflow:answer: three\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000009\n",
-      "INFO:tensorflow:example_index: 9\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] in what year did the student paper common sense begin publication at notre dame ? [SEP] as at most other universities , notre dame ' s students run a number of news media outlets . the nine student - run outlets include three newspapers , both a radio and television station , and several magazines and journals . begun as a one - page journal in september 1876 , the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states . the other magazine , the jug ##gler , is released twice a year and focuses on student literature and artwork . the dome yearbook is published annually . the newspapers have varying publication interests , with the observer published daily and mainly reporting university and other news , and staffed by students from both notre dame and saint mary ' s college . unlike scholastic and the dome , the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university . in 1987 , when some students believed that the observer began to show a conservative bias , a liberal newspaper , common sense was published . likewise , in 2003 , when other students believed that the paper showed a liberal bias , the conservative paper irish rover went into production . neither paper is published as often as the observer ; however , all three are distributed to all students . finally , in spring 2008 an undergraduate journal for political science research , beyond politics , made its debut . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 17:0 18:1 19:2 20:3 21:4 22:4 23:5 24:6 25:6 26:6 27:7 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:14 36:15 37:16 38:17 39:17 40:17 41:18 42:19 43:20 44:21 45:21 46:22 47:23 48:24 49:25 50:26 51:27 52:27 53:28 54:29 55:30 56:31 57:32 58:32 59:33 60:34 61:35 62:36 63:36 64:36 65:37 66:38 67:39 68:40 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:50 80:51 81:52 82:53 83:54 84:55 85:56 86:57 87:58 88:59 89:60 90:60 91:61 92:62 93:63 94:63 95:64 96:65 97:65 98:65 99:66 100:67 101:68 102:69 103:70 104:71 105:72 106:73 107:74 108:75 109:76 110:77 111:77 112:78 113:79 114:80 115:81 116:82 117:83 118:83 119:84 120:85 121:86 122:87 123:88 124:89 125:89 126:90 127:91 128:92 129:93 130:94 131:95 132:96 133:97 134:98 135:99 136:100 137:101 138:101 139:102 140:103 141:104 142:105 143:106 144:107 145:108 146:109 147:110 148:111 149:112 150:112 151:112 152:113 153:113 154:114 155:115 156:116 157:117 158:118 159:118 160:119 161:120 162:121 163:122 164:123 165:124 166:125 167:126 168:127 169:128 170:129 171:130 172:131 173:132 174:133 175:134 176:135 177:136 178:137 179:138 180:138 181:139 182:140 183:140 184:141 185:142 186:143 187:144 188:145 189:146 190:147 191:148 192:149 193:150 194:151 195:152 196:153 197:153 198:154 199:155 200:156 201:156 202:157 203:158 204:159 205:160 206:160 207:161 208:161 209:162 210:163 211:163 212:164 213:165 214:166 215:167 216:168 217:169 218:170 219:171 220:172 221:173 222:174 223:174 224:175 225:176 226:177 227:178 228:179 229:180 230:181 231:182 232:182 233:183 234:184 235:185 236:186 237:187 238:188 239:189 240:190 241:191 242:191 243:192 244:192 245:193 246:194 247:195 248:196 249:197 250:198 251:199 252:199 253:200 254:200 255:201 256:202 257:203 258:204 259:205 260:206 261:207 262:208 263:209 264:210 265:210 266:211 267:212 268:212 269:213 270:214 271:215 272:215\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True 160:True 161:True 162:True 163:True 164:True 165:True 166:True 167:True 168:True 169:True 170:True 171:True 172:True 173:True 174:True 175:True 176:True 177:True 178:True 179:True 180:True 181:True 182:True 183:True 184:True 185:True 186:True 187:True 188:True 189:True 190:True 191:True 192:True 193:True 194:True 195:True 196:True 197:True 198:True 199:True 200:True 201:True 202:True 203:True 204:True 205:True 206:True 207:True 208:True 209:True 210:True 211:True 212:True 213:True 214:True 215:True 216:True 217:True 218:True 219:True 220:True 221:True 222:True 223:True 224:True 225:True 226:True 227:True 228:True 229:True 230:True 231:True 232:True 233:True 234:True 235:True 236:True 237:True 238:True 239:True 240:True 241:True 242:True 
243:True 244:True 245:True 246:True 247:True 248:True 249:True 250:True 251:True 252:True 253:True 254:True 255:True 256:True 257:True 258:True 259:True 260:True 261:True 262:True 263:True 264:True 265:True 266:True 267:True 268:True 269:True 270:True 271:True 272:True\n",
-      "INFO:tensorflow:input_ids: 101 1999 2054 2095 2106 1996 3076 3259 2691 3168 4088 4772 2012 10289 8214 1029 102 2004 2012 2087 2060 5534 1010 10289 8214 1005 1055 2493 2448 1037 2193 1997 2739 2865 11730 1012 1996 3157 3076 1011 2448 11730 2421 2093 6399 1010 2119 1037 2557 1998 2547 2276 1010 1998 2195 7298 1998 9263 1012 5625 2004 1037 2028 1011 3931 3485 1999 2244 7326 1010 1996 24105 2932 2003 3843 3807 7058 1998 4447 2000 2022 1996 4587 7142 9234 4772 1999 1996 2142 2163 1012 1996 2060 2932 1010 1996 26536 17420 1010 2003 2207 3807 1037 2095 1998 7679 2006 3076 3906 1998 8266 1012 1996 8514 24803 2003 2405 6604 1012 1996 6399 2031 9671 4772 5426 1010 2007 1996 9718 2405 3679 1998 3701 7316 2118 1998 2060 2739 1010 1998 21121 2011 2493 2013 2119 10289 8214 1998 3002 2984 1005 1055 2267 1012 4406 24105 1998 1996 8514 1010 1996 9718 2003 2019 2981 4772 1998 2515 2025 2031 1037 4513 8619 2030 2151 8368 15709 2013 1996 2118 1012 1999 3055 1010 2043 2070 2493 3373 2008 1996 9718 2211 2000 2265 1037 4603 13827 1010 1037 4314 3780 1010 2691 3168 2001 2405 1012 10655 1010 1999 2494 1010 2043 2060 2493 3373 2008 1996 3259 3662 1037 4314 13827 1010 1996 4603 3259 3493 13631 2253 2046 2537 1012 4445 3259 2003 2405 2004 2411 2004 1996 9718 1025 2174 1010 2035 2093 2024 5500 2000 2035 2493 1012 2633 1010 1999 3500 2263 2019 8324 3485 2005 2576 2671 2470 1010 3458 4331 1010 2081 2049 2834 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 182\n",
-      "INFO:tensorflow:end_position: 182\n",
-      "INFO:tensorflow:answer: 1987\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000010\n",
-      "INFO:tensorflow:example_index: 10\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] where is the headquarters of the congregation of the holy cross ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 14:0 15:1 16:2 17:3 18:4 19:5 20:6 21:7 22:8 23:9 24:10 25:11 26:12 27:12 28:13 29:14 30:15 31:16 32:16 33:17 34:18 35:19 36:20 37:20 38:20 39:21 40:22 41:23 42:23 43:24 44:24 45:25 46:25 47:26 48:27 49:28 50:29 51:30 52:31 53:32 54:32 55:33 56:34 57:35 58:36 59:37 60:38 61:38 62:39 63:40 64:40 65:41 66:42 67:43 68:44 69:45 70:46 71:47 72:48 73:49 74:50 75:51 76:52 77:52 78:53 79:54 80:54 81:55 82:56 83:57 84:57 85:57 86:58 87:59 88:60 89:61 90:62 91:63 92:64 93:65 94:66 95:66 96:67 97:68 98:69 99:69 100:69 101:70 102:71 103:72 104:72 105:73 106:74 107:75 108:76 109:76 110:76 111:77 112:78 113:79 114:80 115:80 116:80 117:81 118:82 119:83 120:84 121:85 122:85 123:86 124:87 125:88 126:89 127:90 128:91 129:92 130:92 131:92 132:92 133:93 134:94 135:95 136:95 137:96 138:96 139:96 140:97 141:98 142:99 143:100 144:101 145:102 146:103 147:104 148:104 149:105 150:106 151:107 152:108 153:108 154:108 155:109 156:110 157:111 158:111\n",
-      "INFO:tensorflow:token_is_max_context: 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_ids: 101 2073 2003 1996 4075 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 36\n",
-      "INFO:tensorflow:end_position: 36\n",
-      "INFO:tensorflow:answer: rome\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000011\n",
-      "INFO:tensorflow:example_index: 11\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the primary seminary of the congregation of the holy cross ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:8 24:9 25:10 26:11 27:12 28:12 29:13 30:14 31:15 32:16 33:16 34:17 35:18 36:19 37:20 38:20 39:20 40:21 41:22 42:23 43:23 44:24 45:24 46:25 47:25 48:26 49:27 50:28 51:29 52:30 53:31 54:32 55:32 56:33 57:34 58:35 59:36 60:37 61:38 62:38 63:39 64:40 65:40 66:41 67:42 68:43 69:44 70:45 71:46 72:47 73:48 74:49 75:50 76:51 77:52 78:52 79:53 80:54 81:54 82:55 83:56 84:57 85:57 86:57 87:58 88:59 89:60 90:61 91:62 92:63 93:64 94:65 95:66 96:66 97:67 98:68 99:69 100:69 101:69 102:70 103:71 104:72 105:72 106:73 107:74 108:75 109:76 110:76 111:76 112:77 113:78 114:79 115:80 116:80 117:80 118:81 119:82 120:83 121:84 122:85 123:85 124:86 125:87 126:88 127:89 128:90 129:91 130:92 131:92 132:92 133:92 134:93 135:94 136:95 137:95 138:96 139:96 140:96 141:97 142:98 143:99 144:100 145:101 146:102 147:103 148:104 149:104 150:105 151:106 152:107 153:108 154:108 155:108 156:109 157:110 158:111 159:111\n",
-      "INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True 157:True 158:True 159:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 3078 8705 1997 1996 7769 1997 1996 4151 2892 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 44\n",
-      "INFO:tensorflow:end_position: 46\n",
-      "INFO:tensorflow:answer: more ##au seminary\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000012\n",
-      "INFO:tensorflow:example_index: 12\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what is the oldest structure at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 2003 1996 4587 3252 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 59\n",
-      "INFO:tensorflow:end_position: 60\n",
-      "INFO:tensorflow:answer: old college\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000013\n",
-      "INFO:tensorflow:example_index: 13\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] what individuals live at fatima house at notre dame ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 12:0 13:1 14:2 15:3 16:4 17:5 18:6 19:7 20:8 21:9 22:10 23:11 24:12 25:12 26:13 27:14 28:15 29:16 30:16 31:17 32:18 33:19 34:20 35:20 36:20 37:21 38:22 39:23 40:23 41:24 42:24 43:25 44:25 45:26 46:27 47:28 48:29 49:30 50:31 51:32 52:32 53:33 54:34 55:35 56:36 57:37 58:38 59:38 60:39 61:40 62:40 63:41 64:42 65:43 66:44 67:45 68:46 69:47 70:48 71:49 72:50 73:51 74:52 75:52 76:53 77:54 78:54 79:55 80:56 81:57 82:57 83:57 84:58 85:59 86:60 87:61 88:62 89:63 90:64 91:65 92:66 93:66 94:67 95:68 96:69 97:69 98:69 99:70 100:71 101:72 102:72 103:73 104:74 105:75 106:76 107:76 108:76 109:77 110:78 111:79 112:80 113:80 114:80 115:81 116:82 117:83 118:84 119:85 120:85 121:86 122:87 123:88 124:89 125:90 126:91 127:92 128:92 129:92 130:92 131:93 132:94 133:95 134:95 135:96 136:96 137:96 138:97 139:98 140:99 141:100 142:101 143:102 144:103 145:104 146:104 147:105 148:106 149:107 150:108 151:108 152:108 153:109 154:110 155:111 156:111\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_is_max_context: 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\n",
-      "INFO:tensorflow:input_ids: 101 2054 3633 2444 2012 27596 2160 2012 10289 8214 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 84\n",
-      "INFO:tensorflow:end_position: 87\n",
-      "INFO:tensorflow:answer: retired priests and brothers\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000014\n",
-      "INFO:tensorflow:example_index: 14\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] which prize did frederick bu ##ech ##ner create ? [SEP] the university is the major seat of the congregation of holy cross ( albeit not its official headquarters , which are in rome ) . its main seminary , more ##au seminary , is located on the campus across st . joseph lake from the main building . old college , the oldest building on campus and located near the shore of st . mary lake , houses undergraduate seminar ##ians . retired priests and brothers reside in fatima house ( a former retreat center ) , holy cross house , as well as col ##umb ##a hall near the gr ##otto . the university through the more ##au seminary has ties to theologian frederick bu ##ech ##ner . while not catholic , bu ##ech ##ner has praised writers from notre dame and more ##au seminary created a bu ##ech ##ner prize for preaching . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:6 18:7 19:8 20:9 21:10 22:11 23:12 24:12 25:13 26:14 27:15 28:16 29:16 30:17 31:18 32:19 33:20 34:20 35:20 36:21 37:22 38:23 39:23 40:24 41:24 42:25 43:25 44:26 45:27 46:28 47:29 48:30 49:31 50:32 51:32 52:33 53:34 54:35 55:36 56:37 57:38 58:38 59:39 60:40 61:40 62:41 63:42 64:43 65:44 66:45 67:46 68:47 69:48 70:49 71:50 72:51 73:52 74:52 75:53 76:54 77:54 78:55 79:56 80:57 81:57 82:57 83:58 84:59 85:60 86:61 87:62 88:63 89:64 90:65 91:66 92:66 93:67 94:68 95:69 96:69 97:69 98:70 99:71 100:72 101:72 102:73 103:74 104:75 105:76 106:76 107:76 108:77 109:78 110:79 111:80 112:80 113:80 114:81 115:82 116:83 117:84 118:85 119:85 120:86 121:87 122:88 123:89 124:90 125:91 126:92 127:92 128:92 129:92 130:93 131:94 132:95 133:95 134:96 135:96 136:96 137:97 138:98 139:99 140:100 141:101 142:102 143:103 144:104 145:104 146:105 147:106 148:107 149:108 150:108 151:108 152:109 153:110 154:111 155:111\n",
-      "INFO:tensorflow:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 2029 3396 2106 5406 20934 15937 3678 3443 1029 102 1996 2118 2003 1996 2350 2835 1997 1996 7769 1997 4151 2892 1006 12167 2025 2049 2880 4075 1010 2029 2024 1999 4199 1007 1012 2049 2364 8705 1010 2062 4887 8705 1010 2003 2284 2006 1996 3721 2408 2358 1012 3312 2697 2013 1996 2364 2311 1012 2214 2267 1010 1996 4587 2311 2006 3721 1998 2284 2379 1996 5370 1997 2358 1012 2984 2697 1010 3506 8324 18014 7066 1012 3394 8656 1998 3428 13960 1999 27596 2160 1006 1037 2280 7822 2415 1007 1010 4151 2892 2160 1010 2004 2092 2004 8902 25438 2050 2534 2379 1996 24665 23052 1012 1996 2118 2083 1996 2062 4887 8705 2038 7208 2000 17200 5406 20934 15937 3678 1012 2096 2025 3234 1010 20934 15937 3678 2038 5868 4898 2013 10289 8214 1998 2062 4887 8705 2580 1037 20934 15937 3678 3396 2005 17979 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 149\n",
-      "INFO:tensorflow:end_position: 154\n",
-      "INFO:tensorflow:answer: bu ##ech ##ner prize for preaching\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000015\n",
-      "INFO:tensorflow:example_index: 15\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many bs level degrees are offered in the college of engineering at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 18:0 19:1 20:2 21:3 22:4 23:5 24:6 25:7 26:7 27:8 28:8 29:9 30:10 31:11 32:12 33:13 34:14 35:15 36:16 37:17 38:18 39:19 40:20 41:21 42:22 43:23 44:24 45:25 46:26 47:26 48:27 49:28 50:29 51:29 52:30 53:31 54:32 55:33 56:33 57:34 58:34 59:34 60:35 61:36 62:36 63:36 64:36 65:36 66:36 67:36 68:37 69:38 70:39 71:39 72:40 73:41 74:42 75:43 76:44 77:45 78:46 79:47 80:48 81:49 82:49 83:50 84:51 85:52 86:52 87:52 88:52 89:53 90:53 91:54 92:55 93:56 94:57 95:58 96:58 97:59 98:60 99:61 100:62 101:62 102:63 103:64 104:65 105:66 106:67 107:68 108:69 109:69 110:69 111:69 112:70 113:71 114:71 115:72 116:72 117:73 118:74 119:75 120:76 121:76 122:76 123:77 124:78 125:79 126:80 127:81 128:82 129:83 130:84 131:85 132:86 133:87 134:88 135:89 136:90 137:91 138:92 139:92 140:92 141:92 142:93 143:94 144:95 145:96 146:97 147:98 148:98 149:98 150:99 151:99 152:100 153:100\n",
-      "INFO:tensorflow:token_is_max_context: 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 18667 2504 5445 2024 3253 1999 1996 2267 1997 3330 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 107\n",
-      "INFO:tensorflow:end_position: 107\n",
-      "INFO:tensorflow:answer: eight\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000016\n",
-      "INFO:tensorflow:example_index: 16\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] in what year was the college of engineering at notre dame formed ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_to_orig_map: 15:0 16:1 17:2 18:3 19:4 20:5 21:6 22:7 23:7 24:8 25:8 26:9 27:10 28:11 29:12 30:13 31:14 32:15 33:16 34:17 35:18 36:19 37:20 38:21 39:22 40:23 41:24 42:25 43:26 44:26 45:27 46:28 47:29 48:29 49:30 50:31 51:32 52:33 53:33 54:34 55:34 56:34 57:35 58:36 59:36 60:36 61:36 62:36 63:36 64:36 65:37 66:38 67:39 68:39 69:40 70:41 71:42 72:43 73:44 74:45 75:46 76:47 77:48 78:49 79:49 80:50 81:51 82:52 83:52 84:52 85:52 86:53 87:53 88:54 89:55 90:56 91:57 92:58 93:58 94:59 95:60 96:61 97:62 98:62 99:63 100:64 101:65 102:66 103:67 104:68 105:69 106:69 107:69 108:69 109:70 110:71 111:71 112:72 113:72 114:73 115:74 116:75 117:76 118:76 119:76 120:77 121:78 122:79 123:80 124:81 125:82 126:83 127:84 128:85 129:86 130:87 131:88 132:89 133:90 134:91 135:92 136:92 137:92 138:92 139:93 140:94 141:95 142:96 143:97 144:98 145:98 146:98 147:99 148:99 149:100 150:100\n",
-      "INFO:tensorflow:token_is_max_context: 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True\n",
-      "INFO:tensorflow:input_ids: 101 1999 2054 2095 2001 1996 2267 1997 3330 2012 10289 8214 2719 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 22\n",
-      "INFO:tensorflow:end_position: 22\n",
-      "INFO:tensorflow:answer: 1920\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000017\n",
-      "INFO:tensorflow:example_index: 17\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] before the creation of the college of engineering similar studies were carried out at which notre dame college ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 21:0 22:1 23:2 24:3 25:4 26:5 27:6 28:7 29:7 30:8 31:8 32:9 33:10 34:11 35:12 36:13 37:14 38:15 39:16 40:17 41:18 42:19 43:20 44:21 45:22 46:23 47:24 48:25 49:26 50:26 51:27 52:28 53:29 54:29 55:30 56:31 57:32 58:33 59:33 60:34 61:34 62:34 63:35 64:36 65:36 66:36 67:36 68:36 69:36 70:36 71:37 72:38 73:39 74:39 75:40 76:41 77:42 78:43 79:44 80:45 81:46 82:47 83:48 84:49 85:49 86:50 87:51 88:52 89:52 90:52 91:52 92:53 93:53 94:54 95:55 96:56 97:57 98:58 99:58 100:59 101:60 102:61 103:62 104:62 105:63 106:64 107:65 108:66 109:67 110:68 111:69 112:69 113:69 114:69 115:70 116:71 117:71 118:72 119:72 120:73 121:74 122:75 123:76 124:76 125:76 126:77 127:78 128:79 129:80 130:81 131:82 132:83 133:84 134:85 135:86 136:87 137:88 138:89 139:90 140:91 141:92 142:92 143:92 144:92 145:93 146:94 147:95 148:96 149:97 150:98 151:98 152:98 153:99 154:99 155:100 156:100\n",
-      "INFO:tensorflow:token_is_max_context: 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True 156:True\n",
-      "INFO:tensorflow:input_ids: 101 2077 1996 4325 1997 1996 2267 1997 3330 2714 2913 2020 3344 2041 2012 2029 10289 8214 2267 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 43\n",
-      "INFO:tensorflow:end_position: 46\n",
-      "INFO:tensorflow:answer: the college of science\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000018\n",
-      "INFO:tensorflow:example_index: 18\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] how many departments are within the st ##ins ##on - re ##mic ##k hall of engineering ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n",
-      "INFO:tensorflow:token_to_orig_map: 19:0 20:1 21:2 22:3 23:4 24:5 25:6 26:7 27:7 28:8 29:8 30:9 31:10 32:11 33:12 34:13 35:14 36:15 37:16 38:17 39:18 40:19 41:20 42:21 43:22 44:23 45:24 46:25 47:26 48:26 49:27 50:28 51:29 52:29 53:30 54:31 55:32 56:33 57:33 58:34 59:34 60:34 61:35 62:36 63:36 64:36 65:36 66:36 67:36 68:36 69:37 70:38 71:39 72:39 73:40 74:41 75:42 76:43 77:44 78:45 79:46 80:47 81:48 82:49 83:49 84:50 85:51 86:52 87:52 88:52 89:52 90:53 91:53 92:54 93:55 94:56 95:57 96:58 97:58 98:59 99:60 100:61 101:62 102:62 103:63 104:64 105:65 106:66 107:67 108:68 109:69 110:69 111:69 112:69 113:70 114:71 115:71 116:72 117:72 118:73 119:74 120:75 121:76 122:76 123:76 124:77 125:78 126:79 127:80 128:81 129:82 130:83 131:84 132:85 133:86 134:87 135:88 136:89 137:90 138:91 139:92 140:92 141:92 142:92 143:93 144:94 145:95 146:96 147:97 148:98 149:98 150:98 151:99 152:99 153:100 154:100\n",
-      "INFO:tensorflow:token_is_max_context: 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True\n",
-      "INFO:tensorflow:input_ids: 101 2129 2116 7640 2024 2306 1996 2358 7076 2239 1011 2128 7712 2243 2534 1997 3330 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 74\n",
-      "INFO:tensorflow:end_position: 74\n",
-      "INFO:tensorflow:answer: five\n",
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 1000000019\n",
-      "INFO:tensorflow:example_index: 19\n",
-      "INFO:tensorflow:doc_span_index: 0\n",
-      "INFO:tensorflow:tokens: [CLS] the college of science began to offer civil engineering courses beginning at what time at notre dame ? [SEP] the college of engineering was established in 1920 , however , early courses in civil and mechanical engineering were a part of the college of science since the 1870s . today the college , housed in the fitzpatrick , cu ##shing , and st ##ins ##on - re ##mic ##k halls of engineering , includes five departments of study – aerospace and mechanical engineering , chemical and bio ##mo ##le ##cular engineering , civil engineering and geological sciences , computer science and engineering , and electrical engineering – with eight b . s . degrees offered . additionally , the college offers five - year dual degree programs with the colleges of arts and letters and of business awarding additional b . a . and master of business administration ( mba ) degrees , respectively . [SEP]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:token_to_orig_map: 20:0 21:1 22:2 23:3 24:4 25:5 26:6 27:7 28:7 29:8 30:8 31:9 32:10 33:11 34:12 35:13 36:14 37:15 38:16 39:17 40:18 41:19 42:20 43:21 44:22 45:23 46:24 47:25 48:26 49:26 50:27 51:28 52:29 53:29 54:30 55:31 56:32 57:33 58:33 59:34 60:34 61:34 62:35 63:36 64:36 65:36 66:36 67:36 68:36 69:36 70:37 71:38 72:39 73:39 74:40 75:41 76:42 77:43 78:44 79:45 80:46 81:47 82:48 83:49 84:49 85:50 86:51 87:52 88:52 89:52 90:52 91:53 92:53 93:54 94:55 95:56 96:57 97:58 98:58 99:59 100:60 101:61 102:62 103:62 104:63 105:64 106:65 107:66 108:67 109:68 110:69 111:69 112:69 113:69 114:70 115:71 116:71 117:72 118:72 119:73 120:74 121:75 122:76 123:76 124:76 125:77 126:78 127:79 128:80 129:81 130:82 131:83 132:84 133:85 134:86 135:87 136:88 137:89 138:90 139:91 140:92 141:92 142:92 143:92 144:93 145:94 146:95 147:96 148:97 149:98 150:98 151:98 152:99 153:99 154:100 155:100\n",
-      "INFO:tensorflow:token_is_max_context: 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True 110:True 111:True 112:True 113:True 114:True 115:True 116:True 117:True 118:True 119:True 120:True 121:True 122:True 123:True 124:True 125:True 126:True 127:True 128:True 129:True 130:True 131:True 132:True 133:True 134:True 135:True 136:True 137:True 138:True 139:True 140:True 141:True 142:True 143:True 144:True 145:True 146:True 147:True 148:True 149:True 150:True 151:True 152:True 153:True 154:True 155:True\n",
-      "INFO:tensorflow:input_ids: 101 1996 2267 1997 2671 2211 2000 3749 2942 3330 5352 2927 2012 2054 2051 2012 10289 8214 1029 102 1996 2267 1997 3330 2001 2511 1999 4444 1010 2174 1010 2220 5352 1999 2942 1998 6228 3330 2020 1037 2112 1997 1996 2267 1997 2671 2144 1996 14896 1012 2651 1996 2267 1010 7431 1999 1996 26249 1010 12731 12227 1010 1998 2358 7076 2239 1011 2128 7712 2243 9873 1997 3330 1010 2950 2274 7640 1997 2817 1516 13395 1998 6228 3330 1010 5072 1998 16012 5302 2571 15431 3330 1010 2942 3330 1998 9843 4163 1010 3274 2671 1998 3330 1010 1998 5992 3330 1516 2007 2809 1038 1012 1055 1012 5445 3253 1012 5678 1010 1996 2267 4107 2274 1011 2095 7037 3014 3454 2007 1996 6667 1997 2840 1998 4144 1998 1997 2449 21467 3176 1038 1012 1037 1012 1998 3040 1997 2449 3447 1006 15038 1007 5445 1010 4414 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:start_position: 47\n",
-      "INFO:tensorflow:end_position: 48\n",
-      "INFO:tensorflow:answer: the 1870s\n"
-     ]
-    }
-   ],
-   "source": [
-    "bert_config = modeling_tensorflow.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = tokenization.BertTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "\n",
-    "eval_examples = read_squad_examples(\n",
-    "    input_file=input_file, is_training=True, max_num=16)\n",
-    "\n",
-    "eval_features = convert_examples_to_features(\n",
-    "    examples=eval_examples,\n",
-    "    tokenizer=tokenizer,\n",
-    "    max_seq_length=max_seq_length,\n",
-    "    doc_stride=doc_stride,\n",
-    "    max_query_length=max_query_length,\n",
-    "    is_training=True)\n",
-    "\n",
-    "# You can use that to test the behavior of the models when target are outside of the model input sequence\n",
-    "# for feature in eval_features:\n",
-    "#     feature.start_position = outside_pos\n",
-    "#     feature.end_position = outside_pos"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.525632Z",
-     "start_time": "2018-11-06T10:11:37.498695Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "eval_unique_id_to_feature = {}\n",
-    "for eval_feature in eval_features:\n",
-    "    eval_unique_id_to_feature[eval_feature.unique_id] = eval_feature"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.558325Z",
-     "start_time": "2018-11-06T10:11:37.527972Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def input_fn_builder(features, seq_length, drop_remainder):\n",
-    "    \"\"\"Creates an `input_fn` closure to be passed to TPUEstimator.\"\"\"\n",
-    "\n",
-    "    all_unique_ids = []\n",
-    "    all_input_ids = []\n",
-    "    all_input_mask = []\n",
-    "    all_segment_ids = []\n",
-    "    all_start_positions = []\n",
-    "    all_end_positions = []\n",
-    "\n",
-    "    for feature in features:\n",
-    "        all_unique_ids.append(feature.unique_id)\n",
-    "        all_input_ids.append(feature.input_ids)\n",
-    "        all_input_mask.append(feature.input_mask)\n",
-    "        all_segment_ids.append(feature.segment_ids)\n",
-    "        all_start_positions.append(feature.start_position)\n",
-    "        all_end_positions.append(feature.end_position)\n",
-    "\n",
-    "    def input_fn(params):\n",
-    "        \"\"\"The actual input function.\"\"\"\n",
-    "        batch_size = params[\"batch_size\"]\n",
-    "\n",
-    "        num_examples = len(features)\n",
-    "\n",
-    "        # This is for demo purposes and does NOT scale to large data sets. We do\n",
-    "        # not use Dataset.from_generator() because that uses tf.py_func which is\n",
-    "        # not TPU compatible. The right way to load data is with TFRecordReader.\n",
-    "        feature_map = {\n",
-    "            \"unique_ids\":\n",
-    "                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),\n",
-    "            \"input_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_input_ids, shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"input_mask\":\n",
-    "                tf.constant(\n",
-    "                    all_input_mask,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"segment_ids\":\n",
-    "                tf.constant(\n",
-    "                    all_segment_ids,\n",
-    "                    shape=[num_examples, seq_length],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"start_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_start_positions,\n",
-    "                    shape=[num_examples],\n",
-    "                    dtype=tf.int32),\n",
-    "            \"end_positions\":\n",
-    "                tf.constant(\n",
-    "                    all_end_positions,\n",
-    "                    shape=[num_examples],\n",
-    "                    dtype=tf.int32),\n",
-    "        }\n",
-    "\n",
-    "        d = tf.data.Dataset.from_tensor_slices(feature_map)\n",
-    "        d = d.repeat()\n",
-    "        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)\n",
-    "        return d\n",
-    "\n",
-    "    return input_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:37.601666Z",
-     "start_time": "2018-11-06T10:11:37.560082Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "def model_fn_builder(bert_config, init_checkpoint, learning_rate,\n",
-    "                     num_train_steps, num_warmup_steps, use_tpu,\n",
-    "                     use_one_hot_embeddings):\n",
-    "    \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
-    "\n",
-    "    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
-    "        \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
-    "\n",
-    "        tf.logging.info(\"*** Features ***\")\n",
-    "        for name in sorted(features.keys()):\n",
-    "            tf.logging.info(\"  name = %s, shape = %s\" % (name, features[name].shape))\n",
-    "\n",
-    "        unique_ids = features[\"unique_ids\"]\n",
-    "        input_ids = features[\"input_ids\"]\n",
-    "        input_mask = features[\"input_mask\"]\n",
-    "        segment_ids = features[\"segment_ids\"]\n",
-    "\n",
-    "        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
-    "\n",
-    "        (start_logits, end_logits) = create_model(\n",
-    "            bert_config=bert_config,\n",
-    "            is_training=is_training,\n",
-    "            input_ids=input_ids,\n",
-    "            input_mask=input_mask,\n",
-    "            segment_ids=segment_ids,\n",
-    "            use_one_hot_embeddings=use_one_hot_embeddings)\n",
-    "\n",
-    "        tvars = tf.trainable_variables()\n",
-    "\n",
-    "        initialized_variable_names = {}\n",
-    "        scaffold_fn = None\n",
-    "        if init_checkpoint:\n",
-    "            (assignment_map,\n",
-    "             initialized_variable_names) = modeling_tensorflow.get_assigment_map_from_checkpoint(\n",
-    "                tvars, init_checkpoint)\n",
-    "            if use_tpu:\n",
-    "\n",
-    "                def tpu_scaffold():\n",
-    "                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "                    return tf.train.Scaffold()\n",
-    "\n",
-    "                scaffold_fn = tpu_scaffold\n",
-    "            else:\n",
-    "                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
-    "\n",
-    "        tf.logging.info(\"**** Trainable Variables ****\")\n",
-    "        for var in tvars:\n",
-    "            init_string = \"\"\n",
-    "            if var.name in initialized_variable_names:\n",
-    "                init_string = \", *INIT_FROM_CKPT*\"\n",
-    "            tf.logging.info(\"  name = %s, shape = %s%s\", var.name, var.shape,\n",
-    "                            init_string)\n",
-    "\n",
-    "        output_spec = None\n",
-    "        if mode == tf.estimator.ModeKeys.TRAIN:\n",
-    "            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\n",
-    "\n",
-    "            def compute_loss(logits, positions):\n",
-    "                one_hot_positions = tf.one_hot(\n",
-    "                    positions, depth=seq_length, dtype=tf.float32)\n",
-    "                log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
-    "                loss = -tf.reduce_mean(\n",
-    "                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
-    "                return loss\n",
-    "\n",
-    "            start_positions = features[\"start_positions\"]\n",
-    "            end_positions = features[\"end_positions\"]\n",
-    "\n",
-    "            start_loss = compute_loss(start_logits, start_positions)\n",
-    "            end_loss = compute_loss(end_logits, end_positions)\n",
-    "\n",
-    "            total_loss = (start_loss + end_loss) / 2.0\n",
-    "\n",
-    "            train_op = optimization.create_optimizer(\n",
-    "                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\n",
-    "\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode,\n",
-    "                loss=total_loss,\n",
-    "                train_op=train_op,\n",
-    "                scaffold_fn=scaffold_fn)\n",
-    "        elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-    "            batch_size = modeling_tensorflow.get_shape_list(start_logits)[0]\n",
-    "            seq_length = modeling_tensorflow.get_shape_list(input_ids)[1]\n",
-    "\n",
-    "            def compute_loss(logits, positions):\n",
-    "                one_hot_positions = tf.one_hot(\n",
-    "                    positions, depth=seq_length, dtype=tf.float32)\n",
-    "                log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
-    "                loss = -tf.reduce_mean(\n",
-    "                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
-    "                return loss\n",
-    "\n",
-    "            start_positions = features[\"start_positions\"]\n",
-    "            end_positions = features[\"end_positions\"]\n",
-    "\n",
-    "            start_loss = compute_loss(start_logits, start_positions)\n",
-    "            end_loss = compute_loss(end_logits, end_positions)\n",
-    "\n",
-    "            total_loss = (start_loss + end_loss) / 2.0\n",
-    "\n",
-    "            predictions = {\n",
-    "                \"unique_ids\": unique_ids,\n",
-    "                \"start_logits\": start_logits,\n",
-    "                \"end_logits\": end_logits,\n",
-    "                \"total_loss\": tf.reshape(total_loss, [batch_size, 1]),\n",
-    "                \"start_loss\": tf.reshape(start_loss, [batch_size, 1]),\n",
-    "                \"end_loss\": tf.reshape(end_loss, [batch_size, 1]),\n",
-    "            }\n",
-    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
-    "                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\n",
-    "        else:\n",
-    "            raise ValueError(\n",
-    "                \"Only TRAIN and PREDICT modes are supported: %s\" % (mode))\n",
-    "\n",
-    "        return output_spec\n",
-    "\n",
-    "    return model_fn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:41.104542Z",
-     "start_time": "2018-11-06T10:11:37.603474Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x120df3f28>) includes params argument, but params are not passed to Estimator.\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/tmp/squad_base/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11fd09630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    cluster=None,\n",
-    "    master=None,\n",
-    "    model_dir=output_dir,\n",
-    "    save_checkpoints_steps=1000,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        iterations_per_loop=1000,\n",
-    "        num_shards=8,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    learning_rate=learning_rate,\n",
-    "    num_train_steps=None,\n",
-    "    num_warmup_steps=None,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    train_batch_size=12,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "predict_input_fn = input_fn_builder(\n",
-    "    features=eval_features,\n",
-    "    seq_length=max_seq_length,\n",
-    "    drop_remainder=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.857601Z",
-     "start_time": "2018-11-06T10:11:41.106219Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /tmp/squad_base/, running initialization to predict.\n",
-      "INFO:tensorflow:Calling model_fn.\n",
-      "INFO:tensorflow:Running infer on CPU\n",
-      "INFO:tensorflow:*** Features ***\n",
-      "INFO:tensorflow:  name = end_positions, shape = (1,)\n",
-      "INFO:tensorflow:  name = input_ids, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = input_mask, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = segment_ids, shape = (1, 384)\n",
-      "INFO:tensorflow:  name = start_positions, shape = (1,)\n",
-      "INFO:tensorflow:  name = unique_ids, shape = (1,)\n",
-      "INFO:tensorflow:**** Trainable Variables ****\n",
-      "INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n",
-      "INFO:tensorflow:  name = cls/squad/output_weights:0, shape = (2, 768)\n",
-      "INFO:tensorflow:  name = cls/squad/output_bias:0, shape = (2,)\n",
-      "INFO:tensorflow:Done calling model_fn.\n",
-      "INFO:tensorflow:Graph was finalized.\n",
-      "INFO:tensorflow:Running local_init_op.\n",
-      "INFO:tensorflow:Done running local_init_op.\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "tensorflow_all_results = []\n",
-    "for result in estimator.predict(predict_input_fn, yield_single_examples=True):\n",
-    "    unique_id = int(result[\"unique_ids\"])\n",
-    "    eval_feature = eval_unique_id_to_feature[unique_id]\n",
-    "    start_logits = result[\"start_logits\"]\n",
-    "    end_logits = result[\"end_logits\"]\n",
-    "    total_loss = result[\"total_loss\"]\n",
-    "    start_loss = result[\"start_loss\"]\n",
-    "    end_loss = result[\"end_loss\"]\n",
-    "\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    output_json[\"tokens\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\n",
-    "    output_json[\"start_logits\"] = [round(float(x), 6) for x in start_logits.flat]\n",
-    "    output_json[\"end_logits\"] = [round(float(x), 6) for x in end_logits.flat]\n",
-    "    output_json[\"total_loss\"] = [round(float(x), 6) for x in total_loss.flat]\n",
-    "    output_json[\"start_loss\"] = [round(float(x), 6) for x in start_loss.flat]\n",
-    "    output_json[\"end_loss\"] = [round(float(x), 6) for x in end_loss.flat]\n",
-    "    tensorflow_all_out.append(output_json)\n",
-    "    tensorflow_all_results.append(RawResult(\n",
-    "                                    unique_id=unique_id,\n",
-    "                                    start_logits=start_logits,\n",
-    "                                    end_logits=end_logits))\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.912836Z",
-     "start_time": "2018-11-06T10:11:47.859679Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "def _get_best_indexes(logits, n_best_size):\n",
-    "    \"\"\"Get the n-best logits from a list.\"\"\"\n",
-    "    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n",
-    "\n",
-    "    best_indexes = []\n",
-    "    for i in range(len(index_and_score)):\n",
-    "        if i >= n_best_size:\n",
-    "            break\n",
-    "        best_indexes.append(index_and_score[i][0])\n",
-    "    return best_indexes\n",
-    "\n",
-    "def _compute_softmax(scores):\n",
-    "    \"\"\"Compute softmax probability over raw logits.\"\"\"\n",
-    "    if not scores:\n",
-    "        return []\n",
-    "\n",
-    "    max_score = None\n",
-    "    for score in scores:\n",
-    "        if max_score is None or score > max_score:\n",
-    "            max_score = score\n",
-    "\n",
-    "    exp_scores = []\n",
-    "    total_sum = 0.0\n",
-    "    for score in scores:\n",
-    "        x = math.exp(score - max_score)\n",
-    "        exp_scores.append(x)\n",
-    "        total_sum += x\n",
-    "\n",
-    "    probs = []\n",
-    "    for score in exp_scores:\n",
-    "        probs.append(score / total_sum)\n",
-    "    return probs\n",
-    "\n",
-    "\n",
-    "def compute_predictions(all_examples, all_features, all_results, n_best_size,\n",
-    "                      max_answer_length, do_lower_case):\n",
-    "    \"\"\"Compute final predictions.\"\"\"\n",
-    "    example_index_to_features = collections.defaultdict(list)\n",
-    "    for feature in all_features:\n",
-    "        example_index_to_features[feature.example_index].append(feature)\n",
-    "\n",
-    "    unique_id_to_result = {}\n",
-    "    for result in all_results:\n",
-    "        unique_id_to_result[result.unique_id] = result\n",
-    "\n",
-    "    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "        \"PrelimPrediction\",\n",
-    "        [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"])\n",
-    "\n",
-    "    all_predictions = collections.OrderedDict()\n",
-    "    all_nbest_json = collections.OrderedDict()\n",
-    "    for (example_index, example) in enumerate(all_examples):\n",
-    "        features = example_index_to_features[example_index]\n",
-    "\n",
-    "        prelim_predictions = []\n",
-    "        for (feature_index, feature) in enumerate(features):\n",
-    "            result = unique_id_to_result[feature.unique_id]\n",
-    "\n",
-    "            start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n",
-    "            end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n",
-    "            for start_index in start_indexes:\n",
-    "                for end_index in end_indexes:\n",
-    "                    # We could hypothetically create invalid predictions, e.g., predict\n",
-    "                    # that the start of the span is in the question. We throw out all\n",
-    "                    # invalid predictions.\n",
-    "                    if start_index >= len(feature.tokens):\n",
-    "                        continue\n",
-    "                    if end_index >= len(feature.tokens):\n",
-    "                        continue\n",
-    "                    if start_index not in feature.token_to_orig_map:\n",
-    "                        continue\n",
-    "                    if end_index not in feature.token_to_orig_map:\n",
-    "                        continue\n",
-    "                    if not feature.token_is_max_context.get(start_index, False):\n",
-    "                        continue\n",
-    "                    if end_index < start_index:\n",
-    "                        continue\n",
-    "                    length = end_index - start_index + 1\n",
-    "                    if length > max_answer_length:\n",
-    "                        continue\n",
-    "                    prelim_predictions.append(\n",
-    "                        _PrelimPrediction(\n",
-    "                            feature_index=feature_index,\n",
-    "                            start_index=start_index,\n",
-    "                            end_index=end_index,\n",
-    "                            start_logit=result.start_logits[start_index],\n",
-    "                            end_logit=result.end_logits[end_index]))\n",
-    "\n",
-    "        prelim_predictions = sorted(\n",
-    "            prelim_predictions,\n",
-    "            key=lambda x: (x.start_logit + x.end_logit),\n",
-    "            reverse=True)\n",
-    "\n",
-    "        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "            \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"])\n",
-    "\n",
-    "        seen_predictions = {}\n",
-    "        nbest = []\n",
-    "        for pred in prelim_predictions:\n",
-    "            if len(nbest) >= n_best_size:\n",
-    "                break\n",
-    "            feature = features[pred.feature_index]\n",
-    "\n",
-    "            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\n",
-    "            orig_doc_start = feature.token_to_orig_map[pred.start_index]\n",
-    "            orig_doc_end = feature.token_to_orig_map[pred.end_index]\n",
-    "            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\n",
-    "            tok_text = \" \".join(tok_tokens)\n",
-    "\n",
-    "            # De-tokenize WordPieces that have been split off.\n",
-    "            tok_text = tok_text.replace(\" ##\", \"\")\n",
-    "            tok_text = tok_text.replace(\"##\", \"\")\n",
-    "\n",
-    "            # Clean whitespace\n",
-    "            tok_text = tok_text.strip()\n",
-    "            tok_text = \" \".join(tok_text.split())\n",
-    "            orig_text = \" \".join(orig_tokens)\n",
-    "\n",
-    "            final_text = get_final_text(tok_text, orig_text, do_lower_case)\n",
-    "            if final_text in seen_predictions:\n",
-    "                continue\n",
-    "\n",
-    "            seen_predictions[final_text] = True\n",
-    "            nbest.append(\n",
-    "                _NbestPrediction(\n",
-    "                    text=final_text,\n",
-    "                    start_logit=pred.start_logit,\n",
-    "                    end_logit=pred.end_logit))\n",
-    "\n",
-    "        # In very rare edge cases we could have no valid predictions. So we\n",
-    "        # just create a nonce prediction in this case to avoid failure.\n",
-    "        if not nbest:\n",
-    "            nbest.append(\n",
-    "                _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n",
-    "\n",
-    "        assert len(nbest) >= 1\n",
-    "\n",
-    "        total_scores = []\n",
-    "        for entry in nbest:\n",
-    "            total_scores.append(entry.start_logit + entry.end_logit)\n",
-    "\n",
-    "        probs = _compute_softmax(total_scores)\n",
-    "\n",
-    "        nbest_json = []\n",
-    "        for (i, entry) in enumerate(nbest):\n",
-    "            output = collections.OrderedDict()\n",
-    "            output[\"text\"] = entry.text\n",
-    "            output[\"probability\"] = probs[i]\n",
-    "            output[\"start_logit\"] = entry.start_logit\n",
-    "            output[\"end_logit\"] = entry.end_logit\n",
-    "            nbest_json.append(output)\n",
-    "\n",
-    "        assert len(nbest_json) >= 1\n",
-    "\n",
-    "        all_predictions[example.qas_id] = nbest_json[0][\"text\"]\n",
-    "        all_nbest_json[example.qas_id] = nbest_json\n",
-    "\n",
-    "    return all_predictions, all_nbest_json"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.953205Z",
-     "start_time": "2018-11-06T10:11:47.914751Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "all_predictions, all_nbest_json = compute_predictions(eval_examples[:1], eval_features[:1], tensorflow_all_results, 20, max_answer_length, True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:47.994647Z",
-     "start_time": "2018-11-06T10:11:47.955015Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "OrderedDict([('5733be284776f41900661182',\n",
-       "              [OrderedDict([('text', 'empty'),\n",
-       "                            ('probability', 1.0),\n",
-       "                            ('start_logit', 0.0),\n",
-       "                            ('end_logit', 0.0)])])])"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_nbest_json"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.028473Z",
-     "start_time": "2018-11-06T10:11:47.996311Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "7\n",
-      "odict_keys(['linex_index', 'tokens', 'start_logits', 'end_logits', 'total_loss', 'start_loss', 'end_loss'])\n",
-      "number of tokens 176\n",
-      "number of start_logits 384\n",
-      "shape of end_logits 384\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(tensorflow_all_out[0]['tokens']))\n",
-    "print(\"number of start_logits\", len(tensorflow_all_out[0]['start_logits']))\n",
-    "print(\"shape of end_logits\", len(tensorflow_all_out[0]['end_logits']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.060658Z",
-     "start_time": "2018-11-06T10:11:48.030289Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "tensorflow_outputs = [tensorflow_all_out[0]['start_logits'], tensorflow_all_out[0]['end_logits'],\n",
-    "                     tensorflow_all_out[0]['total_loss'], tensorflow_all_out[0]['start_loss'],\n",
-    "                     tensorflow_all_out[0]['end_loss']]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.478814Z",
-     "start_time": "2018-11-06T10:11:48.062585Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import modeling\n",
-    "from run_squad import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:48.512607Z",
-     "start_time": "2018-11-06T10:11:48.480729Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.023405Z",
-     "start_time": "2018-11-06T10:11:48.514306Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([0., 0.])"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = modeling.BertForQuestionAnswering(bert_config)\n",
-    "model.bert.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
-    "model.to(device)\n",
-    "model.qa_outputs.weight.data.fill_(1.0)\n",
-    "model.qa_outputs.bias.data.zero_()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.079364Z",
-     "start_time": "2018-11-06T10:11:51.028228Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n",
-    "all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n",
-    "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
-    "all_start_positions = torch.tensor([[f.start_position] for f in eval_features], dtype=torch.long)\n",
-    "all_end_positions = torch.tensor([[f.end_position] for f in eval_features], dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n",
-    "                                   all_start_positions, all_end_positions, all_example_index)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()\n",
-    "None"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:51.114686Z",
-     "start_time": "2018-11-06T10:11:51.081474Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 384]), torch.Size([1, 1]), torch.Size([1, 1]), torch.Size([1])]\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 1])"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "batch = iter(eval_dataloader).next()\n",
-    "input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\n",
-    "print([t.shape for t in batch])\n",
-    "start_positions.size()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.298367Z",
-     "start_time": "2018-11-06T10:11:51.116219Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Evaluating:   0%|          | 0/270 [00:00<?, ?it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_all_out = []\n",
-    "for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n",
-    "    input_ids, input_mask, segment_ids, start_positions, end_positions, example_index = batch\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "    segment_ids = segment_ids.to(device)\n",
-    "    start_positions = start_positions.to(device)\n",
-    "    end_positions = end_positions.to(device)\n",
-    "\n",
-    "    total_loss, (start_logits, end_logits) = model(input_ids, segment_ids, input_mask, start_positions, end_positions)\n",
-    "    \n",
-    "    eval_feature = eval_features[example_index.item()]\n",
-    "\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    output_json[\"tokens\"] = [token for (i, token) in enumerate(eval_feature.tokens)]\n",
-    "    output_json[\"total_loss\"] = total_loss.detach().cpu().numpy()\n",
-    "    output_json[\"start_logits\"] = start_logits.detach().cpu().numpy()\n",
-    "    output_json[\"end_logits\"] = end_logits.detach().cpu().numpy()\n",
-    "    pytorch_all_out.append(output_json)\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.339553Z",
-     "start_time": "2018-11-06T10:11:52.300335Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "5\n",
-      "odict_keys(['linex_index', 'tokens', 'total_loss', 'start_logits', 'end_logits'])\n",
-      "number of tokens 176\n",
-      "number of start_logits 1\n",
-      "number of end_logits 1\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(len(pytorch_all_out))\n",
-    "print(len(pytorch_all_out[0]))\n",
-    "print(pytorch_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(pytorch_all_out[0]['tokens']))\n",
-    "print(\"number of start_logits\", len(pytorch_all_out[0]['start_logits']))\n",
-    "print(\"number of end_logits\", len(pytorch_all_out[0]['end_logits']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.372827Z",
-     "start_time": "2018-11-06T10:11:52.341393Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "pytorch_outputs = [pytorch_all_out[0]['start_logits'], pytorch_all_out[0]['end_logits'], pytorch_all_out[0]['total_loss']]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3/ Comparing the standard deviation of start_logits, end_logits and loss of both models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.402814Z",
-     "start_time": "2018-11-06T10:11:52.374329Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:11:52.434743Z",
-     "start_time": "2018-11-06T10:11:52.404345Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "shape tensorflow layer, shape pytorch layer, standard deviation\n",
-      "((384,), (1, 384), 5.244962470555037e-06)\n",
-      "((384,), (1, 384), 5.244962470555037e-06)\n",
-      "((1,), (), 4.560241698925438e-06)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('shape tensorflow layer, shape pytorch layer, standard deviation')\n",
-    "print('\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\n",
-    "                          np.array(pytorch_outputs[i]).shape, \n",
-    "                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(3))))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T10:12:54.200059Z",
-     "start_time": "2018-11-06T10:12:54.167355Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total loss of the TF model 9.06024 - Total loss of the PT model 9.0602445602417\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Total loss of the TF model {} - Total loss of the PT model {}\".format(tensorflow_outputs[2][0], pytorch_outputs[2]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/server/transformers/notebooks/Comparing-TF-and-PT-models.ipynb b/server/transformers/notebooks/Comparing-TF-and-PT-models.ipynb
deleted file mode 100644
index b7382e4652bc5c1b80c4664811b1f45375483512..0000000000000000000000000000000000000000
--- a/server/transformers/notebooks/Comparing-TF-and-PT-models.ipynb
+++ /dev/null
@@ -1,1318 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing TensorFlow (original) and PyTorch models\n",
-    "\n",
-    "You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models return all the hidden layers so you can check every stage of the model.\n",
-    "\n",
-    "To run this notebook, follow these instructions:\n",
-    "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n",
-    "- download the original TensorFlow implementation,\n",
-    "- download a pre-trained TensorFlow model as indicated in the TensorFlow implementation readme,\n",
-    "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n",
-    "\n",
-    "If needed, change the relative paths indicated in this notebook (at the beginning of Sections 1 and 2) to point to the relevant models and code."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:56:48.412622Z",
-     "start_time": "2018-11-15T14:56:48.400110Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.chdir('../')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1/ TensorFlow code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:56:49.483829Z",
-     "start_time": "2018-11-15T14:56:49.471296Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
-    "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
-    "\n",
-    "vocab_file = model_dir + \"vocab.txt\"\n",
-    "bert_config_file = model_dir + \"bert_config.json\"\n",
-    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
-    "\n",
-    "input_file = \"./samples/input.txt\"\n",
-    "max_seq_length = 128"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:57:51.597932Z",
-     "start_time": "2018-11-15T14:57:51.549466Z"
-    }
-   },
-   "outputs": [
-    {
-     "ename": "DuplicateFlagError",
-     "evalue": "The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mDuplicateFlagError\u001b[0m                        Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-6-86ecffb49060>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspec_from_file_location\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'*'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moriginal_tf_inplem_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/extract_features_tensorflow.py'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_from_spec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexec_module\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'extract_features_tensorflow'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap_external.py\u001b[0m in \u001b[0;36mexec_module\u001b[0;34m(self, module)\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/importlib/_bootstrap.py\u001b[0m in \u001b[0;36m_call_with_frames_removed\u001b[0;34m(f, *args, **kwds)\u001b[0m\n",
-      "\u001b[0;32m~/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/tensorflow_code/extract_features_tensorflow.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     32\u001b[0m \u001b[0mFLAGS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFLAGS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFINE_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"input_file\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     35\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFINE_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"output_file\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/tensorflow/python/platform/flags.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     56\u001b[0m           \u001b[0;34m'Use of the keyword argument names (flag_name, default_value, '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     57\u001b[0m           'docstring) is deprecated, please use (name, default, help) instead.')\n\u001b[0;32m---> 58\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0moriginal_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m   \u001b[0;32mreturn\u001b[0m \u001b[0mtf_decorator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake_decorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginal_function\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE_string\u001b[0;34m(name, default, help, flag_values, **args)\u001b[0m\n\u001b[1;32m    239\u001b[0m   \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_argument_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArgumentParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    240\u001b[0m   \u001b[0mserializer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_argument_parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArgumentSerializer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 241\u001b[0;31m   \u001b[0mDEFINE\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparser\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhelp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflag_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserializer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE\u001b[0;34m(parser, name, default, help, flag_values, serializer, module_name, **args)\u001b[0m\n\u001b[1;32m     80\u001b[0m   \"\"\"\n\u001b[1;32m     81\u001b[0m   DEFINE_flag(_flag.Flag(parser, serializer, name, default, help, **args),\n\u001b[0;32m---> 82\u001b[0;31m               flag_values, module_name)\n\u001b[0m\u001b[1;32m     83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_defines.py\u001b[0m in \u001b[0;36mDEFINE_flag\u001b[0;34m(flag, flag_values, module_name)\u001b[0m\n\u001b[1;32m    102\u001b[0m   \u001b[0;31m# Copying the reference to flag_values prevents pychecker warnings.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    103\u001b[0m   \u001b[0mfv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m   \u001b[0mfv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mflag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    105\u001b[0m   \u001b[0;31m# Tell flag_values who's defining the flag.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    106\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0mmodule_name\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/miniconda3/envs/bert/lib/python3.6/site-packages/absl/flags/_flagvalues.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, name, flag)\u001b[0m\n\u001b[1;32m    427\u001b[0m         \u001b[0;31m# module is simply being imported a subsequent time.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    428\u001b[0m         \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m       \u001b[0;32mraise\u001b[0m \u001b[0m_exceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDuplicateFlagError\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_flag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    430\u001b[0m     \u001b[0mshort_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshort_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    431\u001b[0m     \u001b[0;31m# If a new flag overrides an old one, we need to cleanup the old flag's\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mDuplicateFlagError\u001b[0m: The flag 'input_file' is defined twice. First from *, Second from *.  Description from first occurrence: (no help available)"
-     ]
-    }
-   ],
-   "source": [
-    "import importlib.util\n",
-    "import sys\n",
-    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features_tensorflow.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['extract_features_tensorflow'] = module\n",
-    "\n",
-    "from extract_features_tensorflow import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:05.650987Z",
-     "start_time": "2018-11-15T14:58:05.541620Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 0\n",
-      "INFO:tensorflow:tokens: [CLS] who was jim henson ? [SEP] jim henson was a puppet ##eer [SEP]\n",
-      "INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    }
-   ],
-   "source": [
-    "layer_indexes = list(range(12))\n",
-    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = tokenization.FullTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "examples = read_examples(input_file)\n",
-    "\n",
-    "features = convert_examples_to_features(\n",
-    "    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\n",
-    "unique_id_to_feature = {}\n",
-    "for feature in features:\n",
-    "    unique_id_to_feature[feature.unique_id] = feature"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:11.562443Z",
-     "start_time": "2018-11-15T14:58:08.036485Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x11ea7f1e0>) includes params argument, but params are not passed to Estimator.\n",
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
-      "graph_options {\n",
-      "  rewrite_options {\n",
-      "    meta_optimizer_iterations: ONE\n",
-      "  }\n",
-      "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x121b163c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
-      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
-      "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
-    "run_config = tf.contrib.tpu.RunConfig(\n",
-    "    master=None,\n",
-    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
-    "        num_shards=1,\n",
-    "        per_host_input_for_training=is_per_host))\n",
-    "\n",
-    "model_fn = model_fn_builder(\n",
-    "    bert_config=bert_config,\n",
-    "    init_checkpoint=init_checkpoint,\n",
-    "    layer_indexes=layer_indexes,\n",
-    "    use_tpu=False,\n",
-    "    use_one_hot_embeddings=False)\n",
-    "\n",
-    "# If TPU is not available, this will fall back to normal Estimator on CPU\n",
-    "# or GPU.\n",
-    "estimator = tf.contrib.tpu.TPUEstimator(\n",
-    "    use_tpu=False,\n",
-    "    model_fn=model_fn,\n",
-    "    config=run_config,\n",
-    "    predict_batch_size=1)\n",
-    "\n",
-    "input_fn = input_fn_builder(\n",
-    "    features=features, seq_length=max_seq_length)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:21.736543Z",
-     "start_time": "2018-11-15T14:58:16.723829Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphs4_nsq9, running initialization to predict.\n",
-      "INFO:tensorflow:Calling model_fn.\n",
-      "INFO:tensorflow:Running infer on CPU\n",
-      "INFO:tensorflow:Done calling model_fn.\n",
-      "INFO:tensorflow:Graph was finalized.\n",
-      "INFO:tensorflow:Running local_init_op.\n",
-      "INFO:tensorflow:Done running local_init_op.\n",
-      "extracting layer 0\n",
-      "extracting layer 1\n",
-      "extracting layer 2\n",
-      "extracting layer 3\n",
-      "extracting layer 4\n",
-      "extracting layer 5\n",
-      "extracting layer 6\n",
-      "extracting layer 7\n",
-      "extracting layer 8\n",
-      "extracting layer 9\n",
-      "extracting layer 10\n",
-      "extracting layer 11\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n"
-     ]
-    }
-   ],
-   "source": [
-    "tensorflow_all_out = []\n",
-    "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
-    "    unique_id = int(result[\"unique_id\"])\n",
-    "    feature = unique_id_to_feature[unique_id]\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    tensorflow_all_out_features = []\n",
-    "    # for (i, token) in enumerate(feature.tokens):\n",
-    "    all_layers = []\n",
-    "    for (j, layer_index) in enumerate(layer_indexes):\n",
-    "        print(\"extracting layer {}\".format(j))\n",
-    "        layer_output = result[\"layer_output_%d\" % j]\n",
-    "        layers = collections.OrderedDict()\n",
-    "        layers[\"index\"] = layer_index\n",
-    "        layers[\"values\"] = layer_output\n",
-    "        all_layers.append(layers)\n",
-    "    tensorflow_out_features = collections.OrderedDict()\n",
-    "    tensorflow_out_features[\"layers\"] = all_layers\n",
-    "    tensorflow_all_out_features.append(tensorflow_out_features)\n",
-    "\n",
-    "    output_json[\"features\"] = tensorflow_all_out_features\n",
-    "    tensorflow_all_out.append(output_json)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:23.970714Z",
-     "start_time": "2018-11-15T14:58:23.931930Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(len(tensorflow_all_out))\n",
-    "print(len(tensorflow_all_out[0]))\n",
-    "print(tensorflow_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
-    "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
-    "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T14:58:25.547012Z",
-     "start_time": "2018-11-15T14:58:25.516076Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2/ PyTorch code"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.chdir('./examples')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:03:49.528679Z",
-     "start_time": "2018-11-15T15:03:49.497697Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import extract_features\n",
-    "import pytorch_transformers as ppb\n",
-    "from extract_features import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:18.001177Z",
-     "start_time": "2018-11-15T15:21:17.970369Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "init_checkpoint_pt = \"../../google_models/uncased_L-12_H-768_A-12/\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:20.893669Z",
-     "start_time": "2018-11-15T15:21:18.786623Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
-      "  \"attention_probs_dropout_prob\": 0.1,\n",
-      "  \"hidden_act\": \"gelu\",\n",
-      "  \"hidden_dropout_prob\": 0.1,\n",
-      "  \"hidden_size\": 768,\n",
-      "  \"initializer_range\": 0.02,\n",
-      "  \"intermediate_size\": 3072,\n",
-      "  \"max_position_embeddings\": 512,\n",
-      "  \"num_attention_heads\": 12,\n",
-      "  \"num_hidden_layers\": 12,\n",
-      "  \"type_vocab_size\": 2,\n",
-      "  \"vocab_size\": 30522\n",
-      "}\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BertEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BertLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BertEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (pooler): BertPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "device = torch.device(\"cpu\")\n",
-    "model = ppb.BertModel.from_pretrained(init_checkpoint_pt)\n",
-    "model.to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:26.963427Z",
-     "start_time": "2018-11-15T15:21:26.922494Z"
-    },
-    "code_folding": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BertEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BertLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BertEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BertLayer(\n",
-       "        (attention): BertAttention(\n",
-       "          (self): BertSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "          (output): BertSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BertLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BertIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BertOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BertLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       "  (pooler): BertPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
-    "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
-    "all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
-    "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
-    "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
-    "eval_sampler = SequentialSampler(eval_data)\n",
-    "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
-    "\n",
-    "model.eval()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:30.718724Z",
-     "start_time": "2018-11-15T15:21:30.329205Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,\n",
-      "          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
-      "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
-      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "tensor([0])\n",
-      "layer 0 0\n",
-      "layer 1 1\n",
-      "layer 2 2\n",
-      "layer 3 3\n",
-      "layer 4 4\n",
-      "layer 5 5\n",
-      "layer 6 6\n",
-      "layer 7 7\n",
-      "layer 8 8\n",
-      "layer 9 9\n",
-      "layer 10 10\n",
-      "layer 11 11\n"
-     ]
-    }
-   ],
-   "source": [
-    "layer_indexes = list(range(12))\n",
-    "\n",
-    "pytorch_all_out = []\n",
-    "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
-    "    print(input_ids)\n",
-    "    print(input_mask)\n",
-    "    print(example_indices)\n",
-    "    input_ids = input_ids.to(device)\n",
-    "    input_mask = input_mask.to(device)\n",
-    "\n",
-    "    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
-    "\n",
-    "    for b, example_index in enumerate(example_indices):\n",
-    "        feature = features[example_index.item()]\n",
-    "        unique_id = int(feature.unique_id)\n",
-    "        # feature = unique_id_to_feature[unique_id]\n",
-    "        output_json = collections.OrderedDict()\n",
-    "        output_json[\"linex_index\"] = unique_id\n",
-    "        all_out_features = []\n",
-    "        # for (i, token) in enumerate(feature.tokens):\n",
-    "        all_layers = []\n",
-    "        for (j, layer_index) in enumerate(layer_indexes):\n",
-    "            print(\"layer\", j, layer_index)\n",
-    "            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
-    "            layer_output = layer_output[b]\n",
-    "            layers = collections.OrderedDict()\n",
-    "            layers[\"index\"] = layer_index\n",
-    "            layer_output = layer_output\n",
-    "            layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
-    "            all_layers.append(layers)\n",
-    "\n",
-    "            out_features = collections.OrderedDict()\n",
-    "            out_features[\"layers\"] = all_layers\n",
-    "            all_out_features.append(out_features)\n",
-    "        output_json[\"features\"] = all_out_features\n",
-    "        pytorch_all_out.append(output_json)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:35.703615Z",
-     "start_time": "2018-11-15T15:21:35.666150Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1\n",
-      "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n",
-      "hidden_size 128\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(len(pytorch_all_out))\n",
-    "print(len(pytorch_all_out[0]))\n",
-    "print(pytorch_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(pytorch_all_out))\n",
-    "print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
-    "print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
-    "pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:36.999073Z",
-     "start_time": "2018-11-15T15:21:36.966762Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
-    "print(pytorch_outputs[0].shape)\n",
-    "print(pytorch_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:37.936522Z",
-     "start_time": "2018-11-15T15:21:37.905269Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(tensorflow_outputs[0].shape)\n",
-    "print(tensorflow_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3/ Comparing the standard deviation on the last layer of both models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:39.437137Z",
-     "start_time": "2018-11-15T15:21:39.406150Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-15T15:21:40.181870Z",
-     "start_time": "2018-11-15T15:21:40.137023Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "shape tensorflow layer, shape pytorch layer, standard deviation\n",
-      "((128, 768), (128, 768), 1.5258875e-07)\n",
-      "((128, 768), (128, 768), 2.342731e-07)\n",
-      "((128, 768), (128, 768), 2.801949e-07)\n",
-      "((128, 768), (128, 768), 3.5904986e-07)\n",
-      "((128, 768), (128, 768), 4.2842768e-07)\n",
-      "((128, 768), (128, 768), 5.127951e-07)\n",
-      "((128, 768), (128, 768), 6.14668e-07)\n",
-      "((128, 768), (128, 768), 7.063922e-07)\n",
-      "((128, 768), (128, 768), 7.906173e-07)\n",
-      "((128, 768), (128, 768), 8.475192e-07)\n",
-      "((128, 768), (128, 768), 8.975489e-07)\n",
-      "((128, 768), (128, 768), 4.1671223e-07)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('shape tensorflow layer, shape pytorch layer, standard deviation')\n",
-    "print('\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\n",
-    "                          np.array(pytorch_outputs[i]).shape, \n",
-    "                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "hide_input": false,
-  "kernelspec": {
-   "display_name": "Python [default]",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.7"
-  },
-  "toc": {
-   "colors": {
-    "hover_highlight": "#DAA520",
-    "running_highlight": "#FF0000",
-    "selected_highlight": "#FFD700"
-   },
-   "moveMenuLeft": true,
-   "nav_menu": {
-    "height": "48px",
-    "width": "252px"
-   },
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 4,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/server/transformers/notebooks/Test Models.ipynb b/server/transformers/notebooks/Test Models.ipynb
deleted file mode 100644
index 18ec939217d2178010e65a1378b7563db2fd59a5..0000000000000000000000000000000000000000
--- a/server/transformers/notebooks/Test Models.ipynb	
+++ /dev/null
@@ -1,526 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%reload_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import transformers\n",
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 67,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import AutoModel, AutoTokenizer, BertModel, DistilBertModel, RobertaModel, GPT2Model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 113,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mname = 'bert-base-uncased'\n",
-    "sentence = 'The count went forward with his original plan'\n",
-    "t_class = BertModel\n",
-    "\n",
-    "def test_model(t_class, mname, sentence):\n",
-    "    m = t_class.from_pretrained(mname, output_hidden_states=True, output_past=False, output_attentions=True, output_additional_info=True)\n",
-    "    t = AutoTokenizer.from_pretrained(mname)\n",
-    "    input_ids = t.encode(sentence)\n",
-    "    outputs = m(torch.tensor(input_ids).unsqueeze(0))\n",
-    "    return outputs\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 58,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mname = 'bert-base-uncased'\n",
-    "sentence = 'The count went forward with his original plan'\n",
-    "t_class = BertModel\n",
-    "out = test_model(t_class, mname, sentence)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mname = 'distilbert-base-uncased'\n",
-    "sentence = 'The count went forward with his original plan'\n",
-    "t_class = DistilBertModel\n",
-    "out = test_model(t_class, mname, sentence)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mname = 'roberta-base'\n",
-    "sentence = 'The count went forward with his original plan'\n",
-    "t_class = RobertaModel\n",
-    "out = test_model(t_class, mname, sentence)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n",
-      "CONTEXTS:  torch.Size([1, 16, 12, 64])\n"
-     ]
-    }
-   ],
-   "source": [
-    "mname = 'gpt2'\n",
-    "sentence = 'The count went forward with his original plan to take over the mighty world of Disney'\n",
-    "t_class = GPT2Model\n",
-    "out = test_model(t_class, mname, sentence)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "4"
-      ]
-     },
-     "execution_count": 123,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(out)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 124,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 16, 12, 64])"
-      ]
-     },
-     "execution_count": 124,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "out[-1][0].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 120,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 16, 768])"
-      ]
-     },
-     "execution_count": 120,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "out[1][0].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 107,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "12"
-      ]
-     },
-     "execution_count": 107,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "m = t_class.from_pretrained(mname)\n",
-    "\n",
-    "m.config.n_head"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 109,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "GPT2Config {\n",
-       "  \"attn_pdrop\": 0.1,\n",
-       "  \"bos_token_id\": 0,\n",
-       "  \"do_sample\": false,\n",
-       "  \"embd_pdrop\": 0.1,\n",
-       "  \"eos_token_ids\": 0,\n",
-       "  \"finetuning_task\": null,\n",
-       "  \"id2label\": {\n",
-       "    \"0\": \"LABEL_0\",\n",
-       "    \"1\": \"LABEL_1\"\n",
-       "  },\n",
-       "  \"initializer_range\": 0.02,\n",
-       "  \"is_decoder\": false,\n",
-       "  \"label2id\": {\n",
-       "    \"LABEL_0\": 0,\n",
-       "    \"LABEL_1\": 1\n",
-       "  },\n",
-       "  \"layer_norm_epsilon\": 1e-05,\n",
-       "  \"length_penalty\": 1.0,\n",
-       "  \"max_length\": 20,\n",
-       "  \"model_type\": \"gpt2\",\n",
-       "  \"n_ctx\": 1024,\n",
-       "  \"n_embd\": 768,\n",
-       "  \"n_head\": 12,\n",
-       "  \"n_layer\": 12,\n",
-       "  \"n_positions\": 1024,\n",
-       "  \"num_beams\": 1,\n",
-       "  \"num_labels\": 2,\n",
-       "  \"num_return_sequences\": 1,\n",
-       "  \"output_additional_info\": false,\n",
-       "  \"output_attentions\": false,\n",
-       "  \"output_hidden_states\": false,\n",
-       "  \"output_past\": true,\n",
-       "  \"pad_token_id\": 0,\n",
-       "  \"pruned_heads\": {},\n",
-       "  \"repetition_penalty\": 1.0,\n",
-       "  \"resid_pdrop\": 0.1,\n",
-       "  \"summary_activation\": null,\n",
-       "  \"summary_first_dropout\": 0.1,\n",
-       "  \"summary_proj_to_labels\": true,\n",
-       "  \"summary_type\": \"cls_index\",\n",
-       "  \"summary_use_proj\": true,\n",
-       "  \"temperature\": 1.0,\n",
-       "  \"top_k\": 50,\n",
-       "  \"top_p\": 1.0,\n",
-       "  \"torchscript\": false,\n",
-       "  \"use_bfloat16\": false,\n",
-       "  \"vocab_size\": 50257\n",
-       "}"
-      ]
-     },
-     "execution_count": 109,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "m.config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 12, 8, 8])"
-      ]
-     },
-     "execution_count": 85,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "out[-1][1].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 1, 12, 8, 64])"
-      ]
-     },
-     "execution_count": 77,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "out[1][0].shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Tokenizing smiles"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['C', 'O', '.', 'C', 'O', 'C', '(', '=', 'O', ')', 'C', '(', 'C', ')', '(', 'C', ')', 'c', '1', 'c', 'c', 'c', '(', 'C', '(', '=', 'O', ')', 'C', 'C', 'C', 'N', '2', 'C', 'C', 'C', '(', 'C', '(', 'O', ')', '(', 'c', '3', 'c', 'c', 'c', 'c', 'c', '3', ')', 'c', '3', 'c', 'c', 'c', 'c', 'c', '3', ')', 'C', 'C', '2', ')', 'c', 'c', '1', '.', 'Cl', '.', 'O', '[Na]', '>>', 'C', 'C', '(', 'C', ')', '(', 'C', '(', '=', 'O', ')', 'O', ')', 'c', '1', 'c', 'c', 'c', '(', 'C', '(', '=', 'O', ')', 'C', 'C', 'C', 'N', '2', 'C', 'C', 'C', '(', 'C', '(', 'O', ')', '(', 'c', '3', 'c', 'c', 'c', 'c', 'c', '3', ')', 'c', '3', 'c', 'c', 'c', 'c', 'c', '3', ')', 'C', 'C', '2', ')', 'c', 'c', '1']\n"
-     ]
-    }
-   ],
-   "source": [
-    "import re\n",
-    "import regex\n",
-    "\n",
-    "def tokenize_smiles(smiles: str) -> str:\n",
-    "    \"\"\"\n",
-    "    Tokenize a SMILES molecule or reaction\n",
-    "    \"\"\"\n",
-    "    pattern = r\"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|\\%\\([0-9]{3}\\)|[0-9])\"\n",
-    "    regex = re.compile(pattern)\n",
-    "    tokens = [token for token in regex.findall(smiles)]\n",
-    "    if smiles != ''.join(tokens):\n",
-    "        raise \n",
-    "#     return ' '.join(tokens)\n",
-    "    return tokens\n",
-    "\n",
-    "\n",
-    "rxn = 'CO.COC(=O)C(C)(C)c1ccc(C(=O)CCCN2CCC(C(O)(c3ccccc3)c3ccccc3)CC2)cc1.Cl.O[Na]>>CC(C)(C(=O)O)c1ccc(C(=O)CCCN2CCC(C(O)(c3ccccc3)c3ccccc3)CC2)cc1'\n",
-    "tokenized_rxn = tokenize_smiles(rxn)\n",
-    "print(tokenized_rxn)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['C',\n",
-       " 'O',\n",
-       " '.',\n",
-       " 'C',\n",
-       " 'O',\n",
-       " 'C',\n",
-       " '(',\n",
-       " '=',\n",
-       " 'O',\n",
-       " ')',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'C',\n",
-       " ')',\n",
-       " '(',\n",
-       " 'C',\n",
-       " ')',\n",
-       " 'c',\n",
-       " '1',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '(',\n",
-       " 'C',\n",
-       " '(',\n",
-       " '=',\n",
-       " 'O',\n",
-       " ')',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'N',\n",
-       " '2',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'O',\n",
-       " ')',\n",
-       " '(',\n",
-       " 'c',\n",
-       " '3',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '3',\n",
-       " ')',\n",
-       " 'c',\n",
-       " '3',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '3',\n",
-       " ')',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " '2',\n",
-       " ')',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '1',\n",
-       " '.',\n",
-       " 'Cl',\n",
-       " '.',\n",
-       " 'O',\n",
-       " '[Na]',\n",
-       " '>>',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'C',\n",
-       " ')',\n",
-       " '(',\n",
-       " 'C',\n",
-       " '(',\n",
-       " '=',\n",
-       " 'O',\n",
-       " ')',\n",
-       " 'O',\n",
-       " ')',\n",
-       " 'c',\n",
-       " '1',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '(',\n",
-       " 'C',\n",
-       " '(',\n",
-       " '=',\n",
-       " 'O',\n",
-       " ')',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'N',\n",
-       " '2',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'C',\n",
-       " '(',\n",
-       " 'O',\n",
-       " ')',\n",
-       " '(',\n",
-       " 'c',\n",
-       " '3',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '3',\n",
-       " ')',\n",
-       " 'c',\n",
-       " '3',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '3',\n",
-       " ')',\n",
-       " 'C',\n",
-       " 'C',\n",
-       " '2',\n",
-       " ')',\n",
-       " 'c',\n",
-       " 'c',\n",
-       " '1']"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "tokenized_rxn"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python [conda env:tformers] *",
-   "language": "python",
-   "name": "conda-env-tformers-py"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/server/transformers/setup.cfg b/server/transformers/setup.cfg
deleted file mode 100644
index e69f8d5551226f7a92bff57d2614af0533abd35b..0000000000000000000000000000000000000000
--- a/server/transformers/setup.cfg
+++ /dev/null
@@ -1,34 +0,0 @@
-[isort]
-ensure_newline_before_comments = True
-force_grid_wrap = 0
-include_trailing_comma = True
-known_first_party = transformers
-known_third_party =
-    absl
-    fairseq
-    fastprogress
-    git
-    h5py
-    MeCab
-    nltk
-    numpy
-    packaging
-    PIL
-    psutil
-    seqeval
-    sklearn
-    tensorboardX
-    tensorflow
-    tensorflow_datasets
-    torch
-    torchtext
-    torchvision
-
-line_length = 119
-lines_after_imports = 2
-multi_line_output = 3
-use_parentheses = True
-
-[flake8]
-ignore = E203, E501, W503
-max-line-length = 119
diff --git a/server/transformers/setup.py b/server/transformers/setup.py
deleted file mode 100644
index b36d51e719bc6c0428dce7237c797db45e195ce1..0000000000000000000000000000000000000000
--- a/server/transformers/setup.py
+++ /dev/null
@@ -1,123 +0,0 @@
-"""
-Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
-
-To create the package for pypi.
-
-1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
-
-2. Commit these changes with the message: "Release: VERSION"
-
-3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
-   Push the tag to git: git push --tags origin master
-
-4. Build both the sources and the wheel. Do not change anything in setup.py between
-   creating the wheel and the source distribution (obviously).
-
-   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
-   (this will build a wheel for the python version you use to build it).
-
-   For the sources, run: "python setup.py sdist"
-   You should now have a /dist directory with both .whl and .tar.gz source versions.
-
-5. Check that everything looks correct by uploading the package to the pypi test server:
-
-   twine upload dist/* -r pypitest
-   (pypi suggest using twine as other methods upload files via plaintext.)
-   You may have to specify the repository url, use the following command then:
-   twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
-
-   Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi transformers
-
-6. Upload the final version to actual pypi:
-   twine upload dist/* -r pypi
-
-7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
-
-"""
-
-import shutil
-from pathlib import Path
-
-from setuptools import find_packages, setup
-
-
-# Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
-stale_egg_info = Path(__file__).parent / "transformers.egg-info"
-if stale_egg_info.exists():
-    print(
-        (
-            "Warning: {} exists.\n\n"
-            "If you recently updated transformers to 3.0 or later, this is expected,\n"
-            "but it may prevent transformers from installing in editable mode.\n\n"
-            "This directory is automatically generated by Python's packaging tools.\n"
-            "I will remove it now.\n\n"
-            "See https://github.com/pypa/pip/issues/5466 for details.\n"
-        ).format(stale_egg_info)
-    )
-    shutil.rmtree(stale_egg_info)
-
-
-extras = {}
-
-extras["mecab"] = ["mecab-python3"]
-extras["sklearn"] = ["scikit-learn"]
-extras["tf"] = ["tensorflow"]
-extras["torch"] = ["torch"]
-
-extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
-extras["all"] = extras["serving"] + ["tensorflow", "torch"]
-
-extras["testing"] = ["pytest", "pytest-xdist"]
-extras["quality"] = ["black", "isort", "flake8"]
-extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
-
-setup(
-    name="transformers",
-    version="2.4.1",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
-    author_email="thomas@huggingface.co",
-    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
-    license="Apache",
-    url="https://github.com/huggingface/transformers",
-    package_dir={"": "src"},
-    packages=find_packages("src"),
-    install_requires=[
-        "numpy",
-        "tokenizers == 0.0.11",
-        # accessing files from S3 directly
-        "boto3",
-        # filesystem locks e.g. to prevent parallel downloads
-        "filelock",
-        # for downloading models over HTTPS
-        "requests",
-        # progress bars in model download and training scripts
-        "tqdm >= 4.27",
-        # for OpenAI GPT
-        "regex != 2019.12.17",
-        # for XLNet
-        "sentencepiece",
-        # for XLM
-        "sacremoses",
-    ],
-    extras_require=extras,
-    scripts=["transformers-cli"],
-    python_requires=">=3.5.0",
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-)
diff --git a/server/transformers/src/transformers/__init__.py b/server/transformers/src/transformers/__init__.py
deleted file mode 100755
index 3cbbf815d65167b8b35f41b220a102dc8f3f84dd..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/__init__.py
+++ /dev/null
@@ -1,429 +0,0 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-
-__version__ = "2.4.1"
-
-# Work around to update TensorFlow's absl.logging threshold which alters the
-# default Python logging output behavior when present.
-# see: https://github.com/abseil/abseil-py/issues/99
-# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
-try:
-    import absl.logging
-except ImportError:
-    pass
-else:
-    absl.logging.set_verbosity("info")
-    absl.logging.set_stderrthreshold("info")
-    absl.logging._warn_preinit_stderr = False
-
-import logging
-
-from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
-from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
-from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
-from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
-from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
-from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
-from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
-from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
-from .configuration_mmbt import MMBTConfig
-from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
-from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
-from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
-from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-
-# Configurations
-from .configuration_utils import PretrainedConfig
-from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
-from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
-from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
-from .data import (
-    DataProcessor,
-    InputExample,
-    InputFeatures,
-    SingleSentenceClassificationProcessor,
-    SquadExample,
-    SquadFeatures,
-    SquadV1Processor,
-    SquadV2Processor,
-    glue_convert_examples_to_features,
-    glue_output_modes,
-    glue_processors,
-    glue_tasks_num_labels,
-    is_sklearn_available,
-    squad_convert_examples_to_features,
-    xnli_output_modes,
-    xnli_processors,
-    xnli_tasks_num_labels,
-)
-
-# Files and general utilities
-from .file_utils import (
-    CONFIG_NAME,
-    MODEL_CARD_NAME,
-    PYTORCH_PRETRAINED_BERT_CACHE,
-    PYTORCH_TRANSFORMERS_CACHE,
-    TF2_WEIGHTS_NAME,
-    TF_WEIGHTS_NAME,
-    TRANSFORMERS_CACHE,
-    WEIGHTS_NAME,
-    add_end_docstrings,
-    add_start_docstrings,
-    cached_path,
-    is_tf_available,
-    is_torch_available,
-)
-
-# Model Cards
-from .modelcard import ModelCard
-
-# TF 2.0 <=> PyTorch conversion utilities
-from .modeling_tf_pytorch_utils import (
-    convert_tf_weight_name_to_pt_weight_name,
-    load_pytorch_checkpoint_in_tf2_model,
-    load_pytorch_model_in_tf2_model,
-    load_pytorch_weights_in_tf2_model,
-    load_tf2_checkpoint_in_pytorch_model,
-    load_tf2_model_in_pytorch_model,
-    load_tf2_weights_in_pytorch_model,
-)
-
-# Pipelines
-from .pipelines import (
-    CsvPipelineDataFormat,
-    FeatureExtractionPipeline,
-    FillMaskPipeline,
-    JsonPipelineDataFormat,
-    NerPipeline,
-    PipedPipelineDataFormat,
-    Pipeline,
-    PipelineDataFormat,
-    QuestionAnsweringPipeline,
-    TextClassificationPipeline,
-    pipeline,
-)
-from .tokenization_albert import AlbertTokenizer
-from .tokenization_auto import AutoTokenizer
-from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
-from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
-from .tokenization_camembert import CamembertTokenizer
-from .tokenization_ctrl import CTRLTokenizer
-from .tokenization_distilbert import DistilBertTokenizer
-from .tokenization_flaubert import FlaubertTokenizer
-from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
-from .tokenization_openai import OpenAIGPTTokenizer
-from .tokenization_roberta import RobertaTokenizer
-from .tokenization_t5 import T5Tokenizer
-from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
-
-# Tokenizers
-from .tokenization_utils import PreTrainedTokenizer
-from .tokenization_xlm import XLMTokenizer
-from .tokenization_xlm_roberta import XLMRobertaTokenizer
-from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
-
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-
-if is_sklearn_available():
-    from .data import glue_compute_metrics, xnli_compute_metrics
-
-
-# Modeling
-if is_torch_available():
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
-    from .modeling_auto import (
-        AutoModel,
-        AutoModelForPreTraining,
-        AutoModelForSequenceClassification,
-        AutoModelForQuestionAnswering,
-        AutoModelWithLMHead,
-        AutoModelForTokenClassification,
-        ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_bert import (
-        BertPreTrainedModel,
-        BertModel,
-        BertForPreTraining,
-        BertForMaskedLM,
-        BertForNextSentencePrediction,
-        BertForSequenceClassification,
-        BertForMultipleChoice,
-        BertForTokenClassification,
-        BertForQuestionAnswering,
-        load_tf_weights_in_bert,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_openai import (
-        OpenAIGPTPreTrainedModel,
-        OpenAIGPTModel,
-        OpenAIGPTLMHeadModel,
-        OpenAIGPTDoubleHeadsModel,
-        load_tf_weights_in_openai_gpt,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_transfo_xl import (
-        TransfoXLPreTrainedModel,
-        TransfoXLModel,
-        TransfoXLLMHeadModel,
-        AdaptiveEmbedding,
-        load_tf_weights_in_transfo_xl,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_gpt2 import (
-        GPT2PreTrainedModel,
-        GPT2Model,
-        GPT2LMHeadModel,
-        GPT2DoubleHeadsModel,
-        load_tf_weights_in_gpt2,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-    from .modeling_xlnet import (
-        XLNetPreTrainedModel,
-        XLNetModel,
-        XLNetLMHeadModel,
-        XLNetForSequenceClassification,
-        XLNetForTokenClassification,
-        XLNetForMultipleChoice,
-        XLNetForQuestionAnsweringSimple,
-        XLNetForQuestionAnswering,
-        load_tf_weights_in_xlnet,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_xlm import (
-        XLMPreTrainedModel,
-        XLMModel,
-        XLMWithLMHeadModel,
-        XLMForSequenceClassification,
-        XLMForQuestionAnswering,
-        XLMForQuestionAnsweringSimple,
-        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_roberta import (
-        RobertaForMaskedLM,
-        RobertaModel,
-        RobertaForSequenceClassification,
-        RobertaForMultipleChoice,
-        RobertaForTokenClassification,
-        RobertaForQuestionAnswering,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_camembert import (
-        CamembertForMaskedLM,
-        CamembertModel,
-        CamembertForSequenceClassification,
-        CamembertForTokenClassification,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_distilbert import (
-        DistilBertPreTrainedModel,
-        DistilBertForMaskedLM,
-        DistilBertModel,
-        DistilBertForSequenceClassification,
-        DistilBertForQuestionAnswering,
-        DistilBertForTokenClassification,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_camembert import (
-        CamembertForMaskedLM,
-        CamembertModel,
-        CamembertForSequenceClassification,
-        CamembertForMultipleChoice,
-        CamembertForTokenClassification,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
-    from .modeling_t5 import (
-        T5PreTrainedModel,
-        T5Model,
-        T5WithLMHeadModel,
-        load_tf_weights_in_t5,
-        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_albert import (
-        AlbertPreTrainedModel,
-        AlbertModel,
-        AlbertForMaskedLM,
-        AlbertForSequenceClassification,
-        AlbertForQuestionAnswering,
-        load_tf_weights_in_albert,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_xlm_roberta import (
-        XLMRobertaForMaskedLM,
-        XLMRobertaModel,
-        XLMRobertaForMultipleChoice,
-        XLMRobertaForSequenceClassification,
-        XLMRobertaForTokenClassification,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-    from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
-
-    from .modeling_flaubert import (
-        FlaubertModel,
-        FlaubertWithLMHeadModel,
-        FlaubertForSequenceClassification,
-        FlaubertForQuestionAnswering,
-        FlaubertForQuestionAnsweringSimple,
-        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    # Optimization
-    from .optimization import (
-        AdamW,
-        get_constant_schedule,
-        get_constant_schedule_with_warmup,
-        get_cosine_schedule_with_warmup,
-        get_cosine_with_hard_restarts_schedule_with_warmup,
-        get_linear_schedule_with_warmup,
-    )
-
-
-# TensorFlow
-if is_tf_available():
-    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
-    from .modeling_tf_auto import (
-        TFAutoModel,
-        TFAutoModelForPreTraining,
-        TFAutoModelForSequenceClassification,
-        TFAutoModelForQuestionAnswering,
-        TFAutoModelWithLMHead,
-        TFAutoModelForTokenClassification,
-        TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_bert import (
-        TFBertPreTrainedModel,
-        TFBertMainLayer,
-        TFBertEmbeddings,
-        TFBertModel,
-        TFBertForPreTraining,
-        TFBertForMaskedLM,
-        TFBertForNextSentencePrediction,
-        TFBertForSequenceClassification,
-        TFBertForMultipleChoice,
-        TFBertForTokenClassification,
-        TFBertForQuestionAnswering,
-        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_gpt2 import (
-        TFGPT2PreTrainedModel,
-        TFGPT2MainLayer,
-        TFGPT2Model,
-        TFGPT2LMHeadModel,
-        TFGPT2DoubleHeadsModel,
-        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_openai import (
-        TFOpenAIGPTPreTrainedModel,
-        TFOpenAIGPTMainLayer,
-        TFOpenAIGPTModel,
-        TFOpenAIGPTLMHeadModel,
-        TFOpenAIGPTDoubleHeadsModel,
-        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_transfo_xl import (
-        TFTransfoXLPreTrainedModel,
-        TFTransfoXLMainLayer,
-        TFTransfoXLModel,
-        TFTransfoXLLMHeadModel,
-        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_xlnet import (
-        TFXLNetPreTrainedModel,
-        TFXLNetMainLayer,
-        TFXLNetModel,
-        TFXLNetLMHeadModel,
-        TFXLNetForSequenceClassification,
-        TFXLNetForTokenClassification,
-        TFXLNetForQuestionAnsweringSimple,
-        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_xlm import (
-        TFXLMPreTrainedModel,
-        TFXLMMainLayer,
-        TFXLMModel,
-        TFXLMWithLMHeadModel,
-        TFXLMForSequenceClassification,
-        TFXLMForQuestionAnsweringSimple,
-        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_xlm_roberta import (
-        TFXLMRobertaForMaskedLM,
-        TFXLMRobertaModel,
-        TFXLMRobertaForSequenceClassification,
-        TFXLMRobertaForTokenClassification,
-        TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_roberta import (
-        TFRobertaPreTrainedModel,
-        TFRobertaMainLayer,
-        TFRobertaModel,
-        TFRobertaForMaskedLM,
-        TFRobertaForSequenceClassification,
-        TFRobertaForTokenClassification,
-        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_camembert import (
-        TFCamembertModel,
-        TFCamembertForMaskedLM,
-        TFCamembertForSequenceClassification,
-        TFCamembertForTokenClassification,
-        TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_distilbert import (
-        TFDistilBertPreTrainedModel,
-        TFDistilBertMainLayer,
-        TFDistilBertModel,
-        TFDistilBertForMaskedLM,
-        TFDistilBertForSequenceClassification,
-        TFDistilBertForTokenClassification,
-        TFDistilBertForQuestionAnswering,
-        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_ctrl import (
-        TFCTRLPreTrainedModel,
-        TFCTRLModel,
-        TFCTRLLMHeadModel,
-        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_albert import (
-        TFAlbertPreTrainedModel,
-        TFAlbertModel,
-        TFAlbertForMaskedLM,
-        TFAlbertForSequenceClassification,
-        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    from .modeling_tf_t5 import (
-        TFT5PreTrainedModel,
-        TFT5Model,
-        TFT5WithLMHeadModel,
-        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-    # Optimization
-    from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator
-
-
-if not is_tf_available() and not is_torch_available():
-    logger.warning(
-        "Neither PyTorch nor TensorFlow >= 2.0 have been found."
-        "Models won't be available and only tokenizers, configuration"
-        "and file/data utilities can be used."
-    )
diff --git a/server/transformers/src/transformers/commands/__init__.py b/server/transformers/src/transformers/commands/__init__.py
deleted file mode 100644
index 13171f42853e27083c89bc7d2a648a2ba3287c20..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from abc import ABC, abstractmethod
-from argparse import ArgumentParser
-
-
-class BaseTransformersCLICommand(ABC):
-    @staticmethod
-    @abstractmethod
-    def register_subcommand(parser: ArgumentParser):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def run(self):
-        raise NotImplementedError()
diff --git a/server/transformers/src/transformers/commands/convert.py b/server/transformers/src/transformers/commands/convert.py
deleted file mode 100644
index a31ef53b624dec01e849e58a05f1e7591acdb1ab..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/convert.py
+++ /dev/null
@@ -1,144 +0,0 @@
-from argparse import ArgumentParser, Namespace
-from logging import getLogger
-
-from transformers.commands import BaseTransformersCLICommand
-
-
-def convert_command_factory(args: Namespace):
-    """
-    Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
-    :return: ServeCommand
-    """
-    return ConvertCommand(
-        args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
-    )
-
-
-class ConvertCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        """
-        Register this command to argparse so it's available for the transformer-cli
-        :param parser: Root parser to register command-specific arguments
-        :return:
-        """
-        train_parser = parser.add_parser(
-            "convert",
-            help="CLI tool to run convert model from original "
-            "author checkpoints to Transformers PyTorch checkpoints.",
-        )
-        train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
-        train_parser.add_argument(
-            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
-        )
-        train_parser.add_argument(
-            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
-        )
-        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
-        train_parser.add_argument(
-            "--finetuning_task_name",
-            type=str,
-            default=None,
-            help="Optional fine-tuning task name if the TF model was a finetuned model.",
-        )
-        train_parser.set_defaults(func=convert_command_factory)
-
-    def __init__(
-        self,
-        model_type: str,
-        tf_checkpoint: str,
-        pytorch_dump_output: str,
-        config: str,
-        finetuning_task_name: str,
-        *args
-    ):
-        self._logger = getLogger("transformers-cli/converting")
-
-        self._logger.info("Loading model {}".format(model_type))
-        self._model_type = model_type
-        self._tf_checkpoint = tf_checkpoint
-        self._pytorch_dump_output = pytorch_dump_output
-        self._config = config
-        self._finetuning_task_name = finetuning_task_name
-
-    def run(self):
-        if self._model_type == "bert":
-            try:
-                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
-                    convert_tf_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
-
-            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "gpt":
-            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import (
-                convert_openai_checkpoint_to_pytorch,
-            )
-
-            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "transfo_xl":
-            try:
-                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
-                    convert_transfo_xl_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
-
-            if "ckpt" in self._tf_checkpoint.lower():
-                TF_CHECKPOINT = self._tf_checkpoint
-                TF_DATASET_FILE = ""
-            else:
-                TF_DATASET_FILE = self._tf_checkpoint
-                TF_CHECKPOINT = ""
-            convert_transfo_xl_checkpoint_to_pytorch(
-                TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
-            )
-        elif self._model_type == "gpt2":
-            try:
-                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
-                    convert_gpt2_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
-
-            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "xlnet":
-            try:
-                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
-                    convert_xlnet_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                msg = (
-                    "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions."
-                )
-                raise ImportError(msg)
-
-            convert_xlnet_checkpoint_to_pytorch(
-                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
-            )
-        elif self._model_type == "xlm":
-            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
-                convert_xlm_checkpoint_to_pytorch,
-            )
-
-            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
-        else:
-            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
diff --git a/server/transformers/src/transformers/commands/download.py b/server/transformers/src/transformers/commands/download.py
deleted file mode 100644
index acfb3eeb927f6d2d30e8fb49d00183fc53de8770..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/download.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from argparse import ArgumentParser
-
-from transformers.commands import BaseTransformersCLICommand
-
-
-def download_command_factory(args):
-    return DownloadCommand(args.model, args.cache_dir, args.force)
-
-
-class DownloadCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        download_parser = parser.add_parser("download")
-        download_parser.add_argument(
-            "--cache-dir", type=str, default=None, help="Path to location to store the models"
-        )
-        download_parser.add_argument(
-            "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
-        )
-        download_parser.add_argument("model", type=str, help="Name of the model to download")
-        download_parser.set_defaults(func=download_command_factory)
-
-    def __init__(self, model: str, cache: str, force: bool):
-        self._model = model
-        self._cache = cache
-        self._force = force
-
-    def run(self):
-        from transformers import AutoModel, AutoTokenizer
-
-        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
-        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
diff --git a/server/transformers/src/transformers/commands/env.py b/server/transformers/src/transformers/commands/env.py
deleted file mode 100644
index efc8fbb683c61bea4896023caabe9cba2c2ea583..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/env.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import platform
-from argparse import ArgumentParser
-
-from transformers import __version__ as version
-from transformers import is_tf_available, is_torch_available
-from transformers.commands import BaseTransformersCLICommand
-
-
-def info_command_factory(_):
-    return EnvironmentCommand()
-
-
-class EnvironmentCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        download_parser = parser.add_parser("env")
-        download_parser.set_defaults(func=info_command_factory)
-
-    def run(self):
-        pt_version = "not installed"
-        pt_cuda_available = "NA"
-        if is_torch_available():
-            import torch
-
-            pt_version = torch.__version__
-            pt_cuda_available = torch.cuda.is_available()
-
-        tf_version = "not installed"
-        tf_cuda_available = "NA"
-        if is_tf_available():
-            import tensorflow as tf
-
-            tf_version = tf.__version__
-            try:
-                # deprecated in v2.1
-                tf_cuda_available = tf.test.is_gpu_available()
-            except AttributeError:
-                # returns list of devices, convert to bool
-                tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
-
-        info = {
-            "`transformers` version": version,
-            "Platform": platform.platform(),
-            "Python version": platform.python_version(),
-            "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
-            "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
-            "Using GPU in script?": "<fill in>",
-            "Using distributed or parallel set-up in script?": "<fill in>",
-        }
-
-        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
-        print(self.format_dict(info))
-
-        return info
-
-    @staticmethod
-    def format_dict(d):
-        return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n"
diff --git a/server/transformers/src/transformers/commands/run.py b/server/transformers/src/transformers/commands/run.py
deleted file mode 100644
index fdc88c55e4a847a160bf9549d8d44d5ea0b6c570..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/run.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import logging
-from argparse import ArgumentParser
-
-from transformers.commands import BaseTransformersCLICommand
-from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
-
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-
-def try_infer_format_from_ext(path: str):
-    if not path:
-        return "pipe"
-
-    for ext in PipelineDataFormat.SUPPORTED_FORMATS:
-        if path.endswith(ext):
-            return ext
-
-    raise Exception(
-        "Unable to determine file format from file extension {}. "
-        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
-    )
-
-
-def run_command_factory(args):
-    nlp = pipeline(
-        task=args.task,
-        model=args.model if args.model else None,
-        config=args.config,
-        tokenizer=args.tokenizer,
-        device=args.device,
-    )
-    format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format
-    reader = PipelineDataFormat.from_str(
-        format=format,
-        output_path=args.output,
-        input_path=args.input,
-        column=args.column if args.column else nlp.default_input_names,
-        overwrite=args.overwrite,
-    )
-    return RunCommand(nlp, reader)
-
-
-class RunCommand(BaseTransformersCLICommand):
-    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
-        self._nlp = nlp
-        self._reader = reader
-
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
-        run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
-        run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
-        run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
-        run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
-        run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
-        run_parser.add_argument(
-            "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
-        )
-        run_parser.add_argument(
-            "--column",
-            type=str,
-            help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
-        )
-        run_parser.add_argument(
-            "--format",
-            type=str,
-            default="infer",
-            choices=PipelineDataFormat.SUPPORTED_FORMATS,
-            help="Input format to read from",
-        )
-        run_parser.add_argument(
-            "--device",
-            type=int,
-            default=-1,
-            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
-        )
-        run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
-        run_parser.set_defaults(func=run_command_factory)
-
-    def run(self):
-        nlp, outputs = self._nlp, []
-
-        for entry in self._reader:
-            output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
-            if isinstance(output, dict):
-                outputs.append(output)
-            else:
-                outputs += output
-
-        # Saving data
-        if self._nlp.binary_output:
-            binary_path = self._reader.save_binary(outputs)
-            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
-        else:
-            self._reader.save(outputs)
diff --git a/server/transformers/src/transformers/commands/serving.py b/server/transformers/src/transformers/commands/serving.py
deleted file mode 100644
index f45d0b0987d5ec68f6001351539405912e16337a..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/serving.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import logging
-from argparse import ArgumentParser, Namespace
-from typing import Any, List, Optional
-
-from transformers import Pipeline
-from transformers.commands import BaseTransformersCLICommand
-from transformers.pipelines import SUPPORTED_TASKS, pipeline
-
-
-try:
-    from uvicorn import run
-    from fastapi import FastAPI, HTTPException, Body
-    from fastapi.routing import APIRoute
-    from pydantic import BaseModel
-    from starlette.responses import JSONResponse
-
-    _serve_dependencies_installed = True
-except (ImportError, AttributeError):
-    BaseModel = object
-
-    def Body(*x, **y):
-        pass
-
-    _serve_dependencies_installed = False
-
-
-logger = logging.getLogger("transformers-cli/serving")
-
-
-def serve_command_factory(args: Namespace):
-    """
-    Factory function used to instantiate serving server from provided command line arguments.
-    :return: ServeCommand
-    """
-    nlp = pipeline(
-        task=args.task,
-        model=args.model if args.model else None,
-        config=args.config,
-        tokenizer=args.tokenizer,
-        device=args.device,
-    )
-    return ServeCommand(nlp, args.host, args.port, args.workers)
-
-
-class ServeModelInfoResult(BaseModel):
-    """
-    Expose model information
-    """
-
-    infos: dict
-
-
-class ServeTokenizeResult(BaseModel):
-    """
-    Tokenize result model
-    """
-
-    tokens: List[str]
-    tokens_ids: Optional[List[int]]
-
-
-class ServeDeTokenizeResult(BaseModel):
-    """
-    DeTokenize result model
-    """
-
-    text: str
-
-
-class ServeForwardResult(BaseModel):
-    """
-    Forward result model
-    """
-
-    output: Any
-
-
-class ServeCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        """
-        Register this command to argparse so it's available for the transformer-cli
-        :param parser: Root parser to register command-specific arguments
-        :return:
-        """
-        serve_parser = parser.add_parser(
-            "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
-        )
-        serve_parser.add_argument(
-            "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on"
-        )
-        serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
-        serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
-        serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers")
-        serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.")
-        serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.")
-        serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.")
-        serve_parser.add_argument(
-            "--device",
-            type=int,
-            default=-1,
-            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
-        )
-        serve_parser.set_defaults(func=serve_command_factory)
-
-    def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int):
-
-        self._pipeline = pipeline
-
-        self.host = host
-        self.port = port
-        self.workers = workers
-
-        if not _serve_dependencies_installed:
-            raise RuntimeError(
-                "Using serve command requires FastAPI and unicorn. "
-                'Please install transformers with [serving]: pip install "transformers[serving]".'
-                "Or install FastAPI and unicorn separately."
-            )
-        else:
-            logger.info("Serving model over {}:{}".format(host, port))
-            self._app = FastAPI(
-                routes=[
-                    APIRoute(
-                        "/",
-                        self.model_info,
-                        response_model=ServeModelInfoResult,
-                        response_class=JSONResponse,
-                        methods=["GET"],
-                    ),
-                    APIRoute(
-                        "/tokenize",
-                        self.tokenize,
-                        response_model=ServeTokenizeResult,
-                        response_class=JSONResponse,
-                        methods=["POST"],
-                    ),
-                    APIRoute(
-                        "/detokenize",
-                        self.detokenize,
-                        response_model=ServeDeTokenizeResult,
-                        response_class=JSONResponse,
-                        methods=["POST"],
-                    ),
-                    APIRoute(
-                        "/forward",
-                        self.forward,
-                        response_model=ServeForwardResult,
-                        response_class=JSONResponse,
-                        methods=["POST"],
-                    ),
-                ],
-                timeout=600,
-            )
-
-    def run(self):
-        run(self._app, host=self.host, port=self.port, workers=self.workers)
-
-    def model_info(self):
-        return ServeModelInfoResult(infos=vars(self._pipeline.model.config))
-
-    def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
-        """
-        Tokenize the provided input and eventually returns corresponding tokens id:
-        - **text_input**: String to tokenize
-        - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping.
-        """
-        try:
-            tokens_txt = self._pipeline.tokenizer.tokenize(text_input)
-
-            if return_ids:
-                tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
-                return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
-            else:
-                return ServeTokenizeResult(tokens=tokens_txt)
-
-        except Exception as e:
-            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
-
-    def detokenize(
-        self,
-        tokens_ids: List[int] = Body(None, embed=True),
-        skip_special_tokens: bool = Body(False, embed=True),
-        cleanup_tokenization_spaces: bool = Body(True, embed=True),
-    ):
-        """
-        Detokenize the provided tokens ids to readable text:
-        - **tokens_ids**: List of tokens ids
-        - **skip_special_tokens**: Flag indicating to not try to decode special tokens
-        - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones.
-        """
-        try:
-            decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
-            return ServeDeTokenizeResult(model="", text=decoded_str)
-        except Exception as e:
-            raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
-
-    async def forward(self, inputs=Body(None, embed=True)):
-        """
-        **inputs**:
-        **attention_mask**:
-        **tokens_type_ids**:
-        """
-
-        # Check we don't have empty string
-        if len(inputs) == 0:
-            return ServeForwardResult(output=[], attention=[])
-
-        try:
-            # Forward through the model
-            output = self._pipeline(inputs)
-            return ServeForwardResult(output=output)
-        except Exception as e:
-            raise HTTPException(500, {"error": str(e)})
diff --git a/server/transformers/src/transformers/commands/train.py b/server/transformers/src/transformers/commands/train.py
deleted file mode 100644
index afa035c9401d57221c02a4dd87069488c9435184..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/train.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import os
-from argparse import ArgumentParser, Namespace
-from logging import getLogger
-
-from transformers import SingleSentenceClassificationProcessor as Processor
-from transformers import TextClassificationPipeline, is_tf_available, is_torch_available
-from transformers.commands import BaseTransformersCLICommand
-
-
-if not is_tf_available() and not is_torch_available():
-    raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
-
-# TF training parameters
-USE_XLA = False
-USE_AMP = False
-
-
-def train_command_factory(args: Namespace):
-    """
-    Factory function used to instantiate serving server from provided command line arguments.
-    :return: ServeCommand
-    """
-    return TrainCommand(args)
-
-
-class TrainCommand(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        """
-        Register this command to argparse so it's available for the transformer-cli
-        :param parser: Root parser to register command-specific arguments
-        :return:
-        """
-        train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")
-
-        train_parser.add_argument(
-            "--train_data",
-            type=str,
-            required=True,
-            help="path to train (and optionally evaluation) dataset as a csv with "
-            "tab separated labels and sentences.",
-        )
-        train_parser.add_argument(
-            "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
-        )
-        train_parser.add_argument(
-            "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
-        )
-        train_parser.add_argument(
-            "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
-        )
-        train_parser.add_argument(
-            "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
-        )
-
-        train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
-        train_parser.add_argument(
-            "--validation_split",
-            type=float,
-            default=0.1,
-            help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.",
-        )
-
-        train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")
-
-        train_parser.add_argument(
-            "--task", type=str, default="text_classification", help="Task to train the model on."
-        )
-        train_parser.add_argument(
-            "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
-        )
-        train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
-        train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
-        train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
-        train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
-        train_parser.set_defaults(func=train_command_factory)
-
-    def __init__(self, args: Namespace):
-        self.logger = getLogger("transformers-cli/training")
-
-        self.framework = "tf" if is_tf_available() else "torch"
-
-        os.makedirs(args.output, exist_ok=True)
-        assert os.path.isdir(args.output)
-        self.output = args.output
-
-        self.column_label = args.column_label
-        self.column_text = args.column_text
-        self.column_id = args.column_id
-
-        self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
-        if args.task == "text_classification":
-            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
-        elif args.task == "token_classification":
-            raise NotImplementedError
-        elif args.task == "question_answering":
-            raise NotImplementedError
-
-        self.logger.info("Loading dataset from {}".format(args.train_data))
-        self.train_dataset = Processor.create_from_csv(
-            args.train_data,
-            column_label=args.column_label,
-            column_text=args.column_text,
-            column_id=args.column_id,
-            skip_first_row=args.skip_first_row,
-        )
-        self.valid_dataset = None
-        if args.validation_data:
-            self.logger.info("Loading validation dataset from {}".format(args.validation_data))
-            self.valid_dataset = Processor.create_from_csv(
-                args.validation_data,
-                column_label=args.column_label,
-                column_text=args.column_text,
-                column_id=args.column_id,
-                skip_first_row=args.skip_first_row,
-            )
-
-        self.validation_split = args.validation_split
-        self.train_batch_size = args.train_batch_size
-        self.valid_batch_size = args.valid_batch_size
-        self.learning_rate = args.learning_rate
-        self.adam_epsilon = args.adam_epsilon
-
-    def run(self):
-        if self.framework == "tf":
-            return self.run_tf()
-        return self.run_torch()
-
-    def run_torch(self):
-        raise NotImplementedError
-
-    def run_tf(self):
-        self.pipeline.fit(
-            self.train_dataset,
-            validation_data=self.valid_dataset,
-            validation_split=self.validation_split,
-            learning_rate=self.learning_rate,
-            adam_epsilon=self.adam_epsilon,
-            train_batch_size=self.train_batch_size,
-            valid_batch_size=self.valid_batch_size,
-        )
-
-        # Save trained pipeline
-        self.pipeline.save_pretrained(self.output)
diff --git a/server/transformers/src/transformers/commands/user.py b/server/transformers/src/transformers/commands/user.py
deleted file mode 100644
index 47c7860114b34b3996dfc1f11fc19b384d3bf8c9..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/commands/user.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import os
-import sys
-from argparse import ArgumentParser
-from getpass import getpass
-from typing import List, Union
-
-from requests.exceptions import HTTPError
-
-from transformers.commands import BaseTransformersCLICommand
-from transformers.hf_api import HfApi, HfFolder
-
-
-UPLOAD_MAX_FILES = 15
-
-
-class UserCommands(BaseTransformersCLICommand):
-    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
-        login_parser = parser.add_parser("login", help="Log in using the same credentials as on huggingface.co")
-        login_parser.set_defaults(func=lambda args: LoginCommand(args))
-        whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.")
-        whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
-        logout_parser = parser.add_parser("logout", help="Log out")
-        logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
-        # s3
-        s3_parser = parser.add_parser("s3", help="{ls, rm} Commands to interact with the files you upload on S3.")
-        s3_subparsers = s3_parser.add_subparsers(help="s3 related commands")
-        ls_parser = s3_subparsers.add_parser("ls")
-        ls_parser.set_defaults(func=lambda args: ListObjsCommand(args))
-        rm_parser = s3_subparsers.add_parser("rm")
-        rm_parser.add_argument("filename", type=str, help="individual object filename to delete from S3.")
-        rm_parser.set_defaults(func=lambda args: DeleteObjCommand(args))
-        # upload
-        upload_parser = parser.add_parser("upload")
-        upload_parser.add_argument("path", type=str, help="Local path of the folder or individual file to upload.")
-        upload_parser.add_argument(
-            "--filename", type=str, default=None, help="Optional: override individual object filename on S3."
-        )
-        upload_parser.set_defaults(func=lambda args: UploadCommand(args))
-
-
-class ANSI:
-    """
-    Helper for en.wikipedia.org/wiki/ANSI_escape_code
-    """
-
-    _bold = "\u001b[1m"
-    _reset = "\u001b[0m"
-
-    @classmethod
-    def bold(cls, s):
-        return "{}{}{}".format(cls._bold, s, cls._reset)
-
-
-class BaseUserCommand:
-    def __init__(self, args):
-        self.args = args
-        self._api = HfApi()
-
-
-class LoginCommand(BaseUserCommand):
-    def run(self):
-        print(
-            """
-        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
-        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
-        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
-        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
-        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
-
-        """
-        )
-        username = input("Username: ")
-        password = getpass()
-        try:
-            token = self._api.login(username, password)
-        except HTTPError as e:
-            # probably invalid credentials, display error message.
-            print(e)
-            exit(1)
-        HfFolder.save_token(token)
-        print("Login successful")
-        print("Your token:", token, "\n")
-        print("Your token has been saved to", HfFolder.path_token)
-
-
-class WhoamiCommand(BaseUserCommand):
-    def run(self):
-        token = HfFolder.get_token()
-        if token is None:
-            print("Not logged in")
-            exit()
-        try:
-            user = self._api.whoami(token)
-            print(user)
-        except HTTPError as e:
-            print(e)
-
-
-class LogoutCommand(BaseUserCommand):
-    def run(self):
-        token = HfFolder.get_token()
-        if token is None:
-            print("Not logged in")
-            exit()
-        HfFolder.delete_token()
-        self._api.logout(token)
-        print("Successfully logged out.")
-
-
-class ListObjsCommand(BaseUserCommand):
-    def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str:
-        """
-        Inspired by:
-        stackoverflow.com/a/8356620/593036
-        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
-        """
-        col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
-        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
-        lines = []
-        lines.append(row_format.format(*headers))
-        lines.append(row_format.format(*["-" * w for w in col_widths]))
-        for row in rows:
-            lines.append(row_format.format(*row))
-        return "\n".join(lines)
-
-    def run(self):
-        token = HfFolder.get_token()
-        if token is None:
-            print("Not logged in")
-            exit(1)
-        try:
-            objs = self._api.list_objs(token)
-        except HTTPError as e:
-            print(e)
-            exit(1)
-        if len(objs) == 0:
-            print("No shared file yet")
-            exit()
-        rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
-        print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))
-
-
-class DeleteObjCommand(BaseUserCommand):
-    def run(self):
-        token = HfFolder.get_token()
-        if token is None:
-            print("Not logged in")
-            exit(1)
-        try:
-            self._api.delete_obj(token, filename=self.args.filename)
-        except HTTPError as e:
-            print(e)
-            exit(1)
-        print("Done")
-
-
-class UploadCommand(BaseUserCommand):
-    def walk_dir(self, rel_path):
-        """
-        Recursively list all files in a folder.
-        """
-        entries: List[os.DirEntry] = list(os.scandir(rel_path))
-        files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()]  # (filepath, filename)
-        for f in entries:
-            if f.is_dir():
-                files += self.walk_dir(f.path)
-        return files
-
-    def run(self):
-        token = HfFolder.get_token()
-        if token is None:
-            print("Not logged in")
-            exit(1)
-        local_path = os.path.abspath(self.args.path)
-        if os.path.isdir(local_path):
-            if self.args.filename is not None:
-                raise ValueError("Cannot specify a filename override when uploading a folder.")
-            rel_path = os.path.basename(local_path)
-            files = self.walk_dir(rel_path)
-        elif os.path.isfile(local_path):
-            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
-            files = [(local_path, filename)]
-        else:
-            raise ValueError("Not a valid file or directory: {}".format(local_path))
-
-        if sys.platform == "win32":
-            files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files]
-
-        if len(files) > UPLOAD_MAX_FILES:
-            print(
-                "About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format(
-                    ANSI.bold(len(files))
-                )
-            )
-            exit(1)
-
-        for filepath, filename in files:
-            print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename)))
-
-        choice = input("Proceed? [Y/n] ").lower()
-        if not (choice == "" or choice == "y" or choice == "yes"):
-            print("Abort")
-            exit()
-        print(ANSI.bold("Uploading... This might take a while if files are large"))
-        for filepath, filename in files:
-            access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath)
-            print("Your file now lives at:")
-            print(access_url)
diff --git a/server/transformers/src/transformers/configuration_albert.py b/server/transformers/src/transformers/configuration_albert.py
deleted file mode 100644
index 3419753cb1ff1065978b5eead21ce10e64706a1d..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_albert.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" ALBERT model configuration """
-
-from .configuration_utils import PretrainedConfig
-
-
-ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
-    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
-    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
-    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
-    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
-    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
-    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
-    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
-}
-
-
-class AlbertConfig(PretrainedConfig):
-    r"""
-        This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
-        It is used to instantiate an ALBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30000):
-                Vocabulary size of the ALBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                Dimensionality of vocabulary embeddings.
-            hidden_size (:obj:`int`, optional, defaults to 4096):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_hidden_groups (:obj:`int`, optional, defaults to 1):
-                Number of groups for the hidden layers, parameters in the same group are shared.
-            num_attention_heads (:obj:`int`, optional, defaults to 64):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 16384):
-                The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            inner_group_num (:obj:`int`, optional, defaults to 1):
-                The number of inner repetition of attention and ffn.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with. Typically set this to something
-                large (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for attached classifiers.
-
-        Example::
-
-            from transformers import AlbertConfig, AlbertModel
-            # Initializing an ALBERT-xxlarge style configuration
-            albert_xxlarge_configuration = AlbertConfig()
-
-            # Initializing an ALBERT-base style configuration
-            albert_base_configuration = AlbertConfig(
-                hidden_size=768,
-                num_attention_heads=12,
-                intermediate_size=3072,
-            )
-
-            # Initializing a model from the ALBERT-base style configuration
-            model = AlbertModel(albert_xxlarge_configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "albert"
-
-    def __init__(
-        self,
-        vocab_size=30000,
-        embedding_size=128,
-        hidden_size=4096,
-        num_hidden_layers=12,
-        num_hidden_groups=1,
-        num_attention_heads=64,
-        intermediate_size=16384,
-        inner_group_num=1,
-        hidden_act="gelu_new",
-        hidden_dropout_prob=0,
-        attention_probs_dropout_prob=0,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        classifier_dropout_prob=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.embedding_size = embedding_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_hidden_groups = num_hidden_groups
-        self.num_attention_heads = num_attention_heads
-        self.inner_group_num = inner_group_num
-        self.hidden_act = hidden_act
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.classifier_dropout_prob = classifier_dropout_prob
diff --git a/server/transformers/src/transformers/configuration_auto.py b/server/transformers/src/transformers/configuration_auto.py
deleted file mode 100644
index 4fd23fee26019594b9636ea5ed8ba7804d3ace95..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_auto.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Auto Config class. """
-
-
-import logging
-from collections import OrderedDict
-
-from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
-from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
-from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
-from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
-from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
-from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
-from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
-from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
-from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
-from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
-from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-from .configuration_utils import PretrainedConfig
-from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
-from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
-from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
-
-
-logger = logging.getLogger(__name__)
-
-
-ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
-    (key, value)
-    for pretrained_map in [
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ]
-    for key, value, in pretrained_map.items()
-)
-
-
-CONFIG_MAPPING = OrderedDict(
-    [
-        ("t5", T5Config,),
-        ("distilbert", DistilBertConfig,),
-        ("albert", AlbertConfig,),
-        ("camembert", CamembertConfig,),
-        ("xlm-roberta", XLMRobertaConfig,),
-        ("roberta", RobertaConfig,),
-        ("flaubert", FlaubertConfig,),
-        ("bert", BertConfig,),
-        ("openai-gpt", OpenAIGPTConfig,),
-        ("gpt2", GPT2Config,),
-        ("transfo-xl", TransfoXLConfig,),
-        ("xlnet", XLNetConfig,),
-        ("xlm", XLMConfig,),
-        ("ctrl", CTRLConfig,),
-    ]
-)
-
-
-class AutoConfig:
-    r"""
-        :class:`~transformers.AutoConfig` is a generic configuration class
-        that will be instantiated as one of the configuration classes of the library
-        when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
-
-        The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoConfig is designed to be instantiated "
-            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method."
-        )
-
-    @classmethod
-    def for_model(cls, model_type, *args, **kwargs):
-        for pattern, config_class in CONFIG_MAPPING.items():
-            if pattern in model_type:
-                return config_class(*args, **kwargs)
-        raise ValueError(
-            "Unrecognized model identifier in {}. Should contain one of {}".format(
-                model_type, ", ".join(CONFIG_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiates one of the configuration classes of the library
-        from a pre-trained model configuration.
-
-        The configuration class to instantiate is selected
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-            - contains `t5`: :class:`~transformers.T5Config` (T5 model)
-            - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
-            - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
-            - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
-            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
-            - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
-            - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
-            - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
-            - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
-            - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
-            - contains `flaubert` : :class:`~transformers.FlaubertConfig` (Flaubert model)
-
-
-        Args:
-            pretrained_model_name_or_path (:obj:`string`):
-                Is either: \
-                    - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                    - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                    - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                    - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
-
-            cache_dir (:obj:`string`, optional, defaults to `None`):
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download (:obj:`boolean`, optional, defaults to `False`):
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
-
-            resume_download (:obj:`boolean`, optional, defaults to `False`):
-                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
-
-            proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
-                The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
-
-            return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
-                - If False, then this function returns just the final configuration object.
-                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
-
-            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
-                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
-
-
-        Examples::
-
-            config = AutoConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
-            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
-            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-            assert config.output_attention == True
-            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
-                                                               foo=False, return_unused_kwargs=True)
-            assert config.output_attention == True
-            assert unused_kwargs == {'foo': False}
-
-        """
-        config_dict, _ = PretrainedConfig.get_config_dict(
-            pretrained_model_name_or_path, pretrained_config_archive_map=ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, **kwargs
-        )
-
-        if "model_type" in config_dict:
-            config_class = CONFIG_MAPPING[config_dict["model_type"]]
-            return config_class.from_dict(config_dict, **kwargs)
-        else:
-            # Fallback: use pattern matching on the string.
-            for pattern, config_class in CONFIG_MAPPING.items():
-                if pattern in pretrained_model_name_or_path:
-                    return config_class.from_dict(config_dict, **kwargs)
-
-        raise ValueError(
-            "Unrecognized model in {}. "
-            "Should have a `model_type` key in its config.json, or contain one of the following strings "
-            "in its name: {}".format(pretrained_model_name_or_path, ", ".join(CONFIG_MAPPING.keys()))
-        )
diff --git a/server/transformers/src/transformers/configuration_bert.py b/server/transformers/src/transformers/configuration_bert.py
deleted file mode 100644
index d668d04cb8ee19207f9ec7c6695365503a477b87..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_bert.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" BERT model configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
-    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
-    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
-    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
-    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
-    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
-    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
-    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
-    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
-    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
-    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
-    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
-    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
-    "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
-    "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
-    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
-    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
-    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
-    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
-    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
-    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
-    "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json",
-}
-
-
-class BertConfig(PretrainedConfig):
-    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
-        It is used to instantiate a BERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
-
-        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the BERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 3072):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-
-        Example::
-
-            from transformers import BertModel, BertConfig
-
-            # Initializing a BERT bert-base-uncased style configuration
-            configuration = BertConfig()
-
-            # Initializing a model from the bert-base-uncased style configuration
-            model = BertModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "bert"
-
-    def __init__(
-        self,
-        vocab_size=30522,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=2,
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.hidden_act = hidden_act
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
diff --git a/server/transformers/src/transformers/configuration_camembert.py b/server/transformers/src/transformers/configuration_camembert.py
deleted file mode 100644
index f930fe2ece43706ece61d5f135088c3e7e89e7bb..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_camembert.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" CamemBERT configuration """
-
-
-import logging
-
-from .configuration_roberta import RobertaConfig
-
-
-logger = logging.getLogger(__name__)
-
-CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
-    "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json",
-    "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json",
-}
-
-
-class CamembertConfig(RobertaConfig):
-    """
-    This class overrides :class:`~transformers.RobertaConfig`, replacing only the pretrained config archive map
-    and the ``model_type``. Please check the superclass for the appropriate documentation alongside usage examples.
-    """
-
-    pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "camembert"
diff --git a/server/transformers/src/transformers/configuration_ctrl.py b/server/transformers/src/transformers/configuration_ctrl.py
deleted file mode 100644
index 4daba2a97ab1578b0bdfbcf674e4cf3ebe28cb3d..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_ctrl.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Salesforce CTRL configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
-
-
-class CTRLConfig(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
-        It is used to instantiate a CTRL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
-
-        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 246534):
-                Vocabulary size of the CTRL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 256):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 1280):
-                Dimensionality of the embeddings and hidden states.
-            dff (:obj:`int`, optional, defaults to 8192):
-                Dimensionality of the inner dimension of the FFN.
-            n_layer (:obj:`int`, optional, defaults to 48):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
-                The epsilon to use in the layer normalization layers.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-
-        Example::
-
-            from transformers import CTRLModel, CTRLConfig
-
-            # Initializing a CTRL configuration
-            configuration = CTRLConfig()
-
-            # Initializing a model from the configuration
-            model = CTRLModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "ctrl"
-
-    def __init__(
-        self,
-        vocab_size=246534,
-        n_positions=256,
-        n_ctx=256,
-        n_embd=1280,
-        dff=8192,
-        n_layer=48,
-        n_head=16,
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-6,
-        initializer_range=0.02,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.dff = dff
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/src/transformers/configuration_distilbert.py b/server/transformers/src/transformers/configuration_distilbert.py
deleted file mode 100644
index b3386e0ab81c6115d641f8b50f7fca70a1bfe212..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_distilbert.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" DistilBERT model configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
-    "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
-    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
-    "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",
-}
-
-
-class DistilBertConfig(PretrainedConfig):
-    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
-        It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
-
-        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the DistilBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings.
-            n_layers (:obj:`int`, optional, defaults to 6):
-                Number of hidden layers in the Transformer encoder.
-            n_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dim (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            hidden_dim (:obj:`int`, optional, defaults to 3072):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            qa_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilities used in the question answering model
-                :class:`~transformers.DistilBertForQuestionAnswering`.
-            seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
-                The dropout probabilities used in the sequence classification model
-                :class:`~transformers.DistilBertForSequenceClassification`.
-
-        Example::
-
-            from transformers import DistilBertModel, DistilBertConfig
-
-            # Initializing a DistilBERT configuration
-            configuration = DistilBertConfig()
-
-            # Initializing a model from the configuration
-            model = DistilBertModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "distilbert"
-
-    def __init__(
-        self,
-        vocab_size=30522,
-        max_position_embeddings=512,
-        sinusoidal_pos_embds=False,
-        n_layers=6,
-        n_heads=12,
-        dim=768,
-        hidden_dim=4 * 768,
-        dropout=0.1,
-        attention_dropout=0.1,
-        activation="gelu",
-        initializer_range=0.02,
-        qa_dropout=0.1,
-        seq_classif_dropout=0.2,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.sinusoidal_pos_embds = sinusoidal_pos_embds
-        self.n_layers = n_layers
-        self.n_heads = n_heads
-        self.dim = dim
-        self.hidden_dim = hidden_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation = activation
-        self.initializer_range = initializer_range
-        self.qa_dropout = qa_dropout
-        self.seq_classif_dropout = seq_classif_dropout
-
-    @property
-    def hidden_size(self):
-        return self.dim
-
-    @property
-    def num_attention_heads(self):
-        return self.n_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layers
diff --git a/server/transformers/src/transformers/configuration_flaubert.py b/server/transformers/src/transformers/configuration_flaubert.py
deleted file mode 100644
index 511033081996d6d794ff86ecde0e1ca106a9e283..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_flaubert.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Flaubert configuration, based on XLM. """
-
-
-import logging
-
-from .configuration_xlm import XLMConfig
-
-
-logger = logging.getLogger(__name__)
-
-FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/config.json",
-    "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/config.json",
-    "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/config.json",
-    "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/config.json",
-}
-
-
-class FlaubertConfig(XLMConfig):
-    """
-        Configuration class to store the configuration of a `FlaubertModel`.
-        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
-        It is used to instantiate an XLM model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
-
-        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether to apply the layer normalization before or after the feed forward layer following the
-                attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
-            layerdrop (:obj:`float`, `optional`, defaults to 0.0):
-                Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
-                with Structured Dropout. ICLR 2020)
-            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the Flaubert model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
-            emb_dim (:obj:`int`, optional, defaults to 2048):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for the attention mechanism.
-            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
-            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
-            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Set this to `True` for the model to behave in a causal manner.
-                Causal models use a triangular attention mask in order to only attend to the left-side context instead
-                of a bidirectional context.
-            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-                layer.
-            n_langs (:obj:`int`, optional, defaults to 1):
-                The number of languages the model handles. Set to 1 for monolingual models.
-            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether to use language embeddings. Some models use additional language embeddings, see
-                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-                for information on how to use them.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
-                The standard deviation of the truncated_normal_initializer for
-                initializing the embedding matrices.
-            init_std (:obj:`int`, optional, defaults to 50257):
-                The standard deviation of the truncated_normal_initializer for
-                initializing all weight matrices except the embedding matrices. NOTE(review): the stated type and default look implausible for a standard deviation; confirm against XLMConfig.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            bos_index (:obj:`int`, optional, defaults to 0):
-                The index of the beginning of sentence token in the vocabulary.
-            eos_index (:obj:`int`, optional, defaults to 1):
-                The index of the end of sentence token in the vocabulary.
-            pad_index (:obj:`int`, optional, defaults to 2):
-                The index of the padding token in the vocabulary.
-            unk_index (:obj:`int`, optional, defaults to 3):
-                The index of the unknown token in the vocabulary.
-            mask_index (:obj:`int`, optional, defaults to 5):
-                The index of the masking token in the vocabulary.
-            is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a dropout before the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            mask_token_id (:obj:`int`, optional, defaults to 0):
-                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-            lang_id (:obj:`int`, optional, defaults to 1):
-                The ID of the language used by the model. This parameter is used when generating
-                text in a given language.
-    """
-
-    pretrained_config_archive_map = FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "flaubert"
-
-    def __init__(self, layerdrop=0.0, pre_norm=False, **kwargs):
-        """Constructs FlaubertConfig.
-        """
-        super().__init__(**kwargs)
-        self.layerdrop = layerdrop
-        self.pre_norm = pre_norm
diff --git a/server/transformers/src/transformers/configuration_gpt2.py b/server/transformers/src/transformers/configuration_gpt2.py
deleted file mode 100644
index 7fff0b6c4918f08b4817b3aa0fb16a0723db2de0..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_gpt2.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OpenAI GPT-2 configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
-    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
-    "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
-    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",
-}
-
-
-class GPT2Config(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
-        It is used to instantiate an GPT-2 model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 50257):
-                Vocabulary size of the GPT-2 model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
-            n_positions (:obj:`int`, optional, defaults to 1024):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 16):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a dropout before the projection and activation
-
-        Example::
-
-            from transformers import GPT2Model, GPT2Config
-
-            # Initializing a GPT2 configuration
-            configuration = GPT2Config()
-
-            # Initializing a model from the configuration
-            model = GPT2Model(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "gpt2"
-
-    def __init__(
-        self,
-        vocab_size=50257,
-        n_positions=1024,
-        n_ctx=1024,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/src/transformers/configuration_mmbt.py b/server/transformers/src/transformers/configuration_mmbt.py
deleted file mode 100644
index 56a35e237c07400fe714940d9c85b0700893fbd1..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_mmbt.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# coding=utf-8
-# Copyright (c) Facebook, Inc. and its affiliates.
-# Copyright (c) HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" MMBT configuration """
-
-
-import logging
-
-
-logger = logging.getLogger(__name__)
-
-
-class MMBTConfig(object):
-    """Configuration class to store the configuration of a `MMBT Model`.
-
-    Args:
-        config (:obj:`~transformers.PreTrainedConfig`):
-            Config of the underlying Transformer models. Its values are
-            copied over to use a single config.
-        num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
-            Size of final Linear layer for classification.
-        modal_hidden_size (:obj:`int`, optional, defautls to 2048):
-            Embedding dimension of the non-text modality encoder.
-    """
-
-    def __init__(self, config, num_labels=None, modal_hidden_size=2048):
-        self.__dict__ = config.__dict__
-        self.modal_hidden_size = modal_hidden_size
-        if num_labels:
-            self.num_labels = num_labels
diff --git a/server/transformers/src/transformers/configuration_openai.py b/server/transformers/src/transformers/configuration_openai.py
deleted file mode 100644
index d4a965bde14eadacc9665521fe300373f9ccf688..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_openai.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OpenAI GPT configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
-}
-
-
-class OpenAIGPTConfig(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
-        It is used to instantiate an GPT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 40478):
-                Vocabulary size of the GPT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether special tokens should be predicted when the model is has a language modeling head.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a dropout before the projection and activation
-
-        Example::
-
-            from transformers import OpenAIGPTConfig, OpenAIGPTModel
-
-            # Initializing a GPT configuration
-            configuration = OpenAIGPTConfig()
-
-            # Initializing a model from the configuration
-            model = OpenAIGPTModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "openai-gpt"
-
-    def __init__(
-        self,
-        vocab_size=40478,
-        n_positions=512,
-        n_ctx=512,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        afn="gelu",
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        predict_special_tokens=True,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.afn = afn
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.predict_special_tokens = predict_special_tokens
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/src/transformers/configuration_roberta.py b/server/transformers/src/transformers/configuration_roberta.py
deleted file mode 100644
index 655fe03b71424a64c009f5c4a289ec23ca5ed354..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_roberta.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" RoBERTa configuration """
-
-
-import logging
-
-from .configuration_bert import BertConfig
-
-
-logger = logging.getLogger(__name__)
-
-ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
-    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
-    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
-    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
-    "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
-    "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
-}
-
-
-class RobertaConfig(BertConfig):
-    r"""
-        This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
-        It is used to instantiate an RoBERTa model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
-        It reuses the same defaults. Please check the parent class for more information.
-
-        Example::
-
-            from transformers import RobertaConfig, RobertaModel
-
-            # Initializing a RoBERTa configuration
-            configuration = RobertaConfig()
-
-            # Initializing a model from the configuration
-            model = RobertaModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-    pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "roberta"
diff --git a/server/transformers/src/transformers/configuration_t5.py b/server/transformers/src/transformers/configuration_t5.py
deleted file mode 100644
index 39dd7b4e249bf70428d6209d03ad6696f23faf89..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_t5.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# coding=utf-8
-# Copyright 2010, The T5 Authors and HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" T5 model configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
-    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
-    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
-    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
-}
-
-
-class T5Config(PretrainedConfig):
-    r"""
-        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
-        `T5Model`.
-
-
-        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
-            hidden_size: Size of the encoder layers and the pooler layer.
-            num_hidden_layers: Number of hidden layers in the Transformer encoder.
-            num_attention_heads: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `T5Model`.
-            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
-            layer_norm_eps: The epsilon used by LayerNorm.
-    """
-    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "t5"
-
-    def __init__(
-        self,
-        vocab_size=32128,
-        n_positions=512,
-        d_model=512,
-        d_kv=64,
-        d_ff=2048,
-        num_layers=6,
-        num_heads=8,
-        relative_attention_num_buckets=32,
-        dropout_rate=0.1,
-        layer_norm_epsilon=1e-6,
-        initializer_factor=1.0,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.n_positions = n_positions
-        self.d_model = d_model
-        self.d_kv = d_kv
-        self.d_ff = d_ff
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.relative_attention_num_buckets = relative_attention_num_buckets
-        self.dropout_rate = dropout_rate
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_factor = initializer_factor
-
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.d_model
-
-    @property
-    def num_attention_heads(self):
-        return self.num_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.num_layers
diff --git a/server/transformers/src/transformers/configuration_transfo_xl.py b/server/transformers/src/transformers/configuration_transfo_xl.py
deleted file mode 100644
index ebcc4af4f74de5e0efd13a886b38f79e47b6fbd6..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_transfo_xl.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Transformer XL configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
-}
-
-
-class TransfoXLConfig(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
-        It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 267735):
-                Vocabulary size of the Transformer XL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
-            cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
-                Cutoffs for the adaptive softmax
-            d_model (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the model's hidden states.
-            d_embed (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the embeddings
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            d_head (:obj:`int`, optional, defaults to 64):
-                Dimensionality of the model's heads.
-            d_inner (:obj:`int`, optional, defaults to 4096):
-                Inner dimension in FF
-            div_val (:obj:`int`, optional, defaults to 4):
-                Divident value for adapative input and softmax
-            pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Apply LayerNorm to the input instead of the output
-            n_layer (:obj:`int`, optional, defaults to 18):
-                Number of hidden layers in the Transformer encoder.
-            tgt_len (:obj:`int`, optional, defaults to 128):
-                Number of tokens to predict
-            ext_len (:obj:`int`, optional, defaults to 0):
-                Length of the extended context
-            mem_len (:obj:`int`, optional, defaults to 1600):
-                Length of the retained previous heads
-            clamp_len (:obj:`int`, optional, defaults to 1000):
-                use the same pos embeddings after clamp_len
-            same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Use the same attn length for all tokens
-            proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
-                True to share all but first projs, False not to share.
-            attn_type (:obj:`int`, optional, defaults to 0):
-                Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-            sample_softmax (:obj:`int`, optional, defaults to -1):
-                number of samples in sampled softmax
-            adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
-                use adaptive softmax
-            tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
-                tie the word embedding and softmax weights
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            dropatt (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Untie relative position biases
-            init (:obj:`string`, optional, defaults to `normal`):
-                Parameter initializer to use
-            init_range (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by U(-init_range, init_range).
-            proj_init_std (:obj:`float`, optional, defaults to 0.01):
-                Parameters initialized by N(0, init_std)
-            init_std (:obj:`float`, optional, defaults to 0.02):
-                Parameters initialized by N(0, init_std)
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-
-        Example::
-
-            from transformers import TransfoXLConfig, TransfoXLModel
-
-            # Initializing a Transformer XL configuration
-            configuration = TransfoXLConfig()
-
-            # Initializing a model from the configuration
-            model = TransfoXLModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "transfo-xl"
-
-    def __init__(
-        self,
-        vocab_size=267735,
-        cutoffs=[20000, 40000, 200000],
-        d_model=1024,
-        d_embed=1024,
-        n_head=16,
-        d_head=64,
-        d_inner=4096,
-        div_val=4,
-        pre_lnorm=False,
-        n_layer=18,
-        tgt_len=128,
-        ext_len=0,
-        mem_len=1600,
-        clamp_len=1000,
-        same_length=True,
-        proj_share_all_but_first=True,
-        attn_type=0,
-        sample_softmax=-1,
-        adaptive=True,
-        tie_weight=True,
-        dropout=0.1,
-        dropatt=0.0,
-        untie_r=True,
-        init="normal",
-        init_range=0.01,
-        proj_init_std=0.01,
-        init_std=0.02,
-        layer_norm_epsilon=1e-5,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.cutoffs = []
-        self.cutoffs.extend(cutoffs)
-        self.tie_weight = tie_weight
-        if proj_share_all_but_first:
-            self.tie_projs = [False] + [True] * len(self.cutoffs)
-        else:
-            self.tie_projs = [False] + [False] * len(self.cutoffs)
-        self.d_model = d_model
-        self.d_embed = d_embed
-        self.d_head = d_head
-        self.d_inner = d_inner
-        self.div_val = div_val
-        self.pre_lnorm = pre_lnorm
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.tgt_len = tgt_len
-        self.ext_len = ext_len
-        self.mem_len = mem_len
-        self.same_length = same_length
-        self.attn_type = attn_type
-        self.clamp_len = clamp_len
-        self.sample_softmax = sample_softmax
-        self.adaptive = adaptive
-        self.dropout = dropout
-        self.dropatt = dropatt
-        self.untie_r = untie_r
-        self.init = init
-        self.init_range = init_range
-        self.proj_init_std = proj_init_std
-        self.init_std = init_std
-        self.layer_norm_epsilon = layer_norm_epsilon
-
-    @property
-    def max_position_embeddings(self):
-        return self.tgt_len + self.ext_len + self.mem_len
-
-    @property
-    def n_token(self):  # Backward compatibility
-        return self.vocab_size
-
-    @n_token.setter
-    def n_token(self, value):  # Backward compatibility
-        self.vocab_size = value
-
-    @property
-    def hidden_size(self):
-        return self.d_model
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/src/transformers/configuration_utils.py b/server/transformers/src/transformers/configuration_utils.py
deleted file mode 100644
index 97b68ce16d5fd1f9de98b6f40a7686fa34d52e08..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_utils.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Configuration base class and utilities."""
-
-
-import copy
-import json
-import logging
-import os
-from typing import Dict, Optional, Tuple
-
-from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url
-
-
-logger = logging.getLogger(__name__)
-
-
-class PretrainedConfig(object):
-    r""" Base class for all configuration classes.
-        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
-
-        Note:
-            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
-            It only affects the model's configuration.
-
-        Class attributes (overridden by derived classes):
-            - ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
-            - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`.
-
-        Args:
-            finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`):
-                Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
-            num_labels (:obj:`int`, `optional`, defaults to `2`):
-                Number of classes to use when the model is a classification model (sequences/tokens)
-            output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Should the model returns attentions weights.
-            output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`):
-                Should the model returns all hidden-states.
-            torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Is the model used with Torchscript (for PyTorch models).
-    """
-    pretrained_config_archive_map = {}  # type: Dict[str, str]
-    model_type = ""  # type: str
-
-    def __init__(self, **kwargs):
-        # Attributes with defaults
-        self.output_attentions = kwargs.pop("output_attentions", False)
-        self.output_hidden_states = kwargs.pop("output_hidden_states", False)
-        self.output_additional_info = kwargs.pop("output_additional_info", False)
-        self.output_past = kwargs.pop("output_past", True)  # Not used by all models
-        self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
-        self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
-        self.pruned_heads = kwargs.pop("pruned_heads", {})
-
-        # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
-        self.is_decoder = kwargs.pop("is_decoder", False)
-
-        # Parameters for sequence generation
-        self.max_length = kwargs.pop("max_length", 20)
-        self.do_sample = kwargs.pop("do_sample", False)
-        self.num_beams = kwargs.pop("num_beams", 1)
-        self.temperature = kwargs.pop("temperature", 1.0)
-        self.top_k = kwargs.pop("top_k", 50)
-        self.top_p = kwargs.pop("top_p", 1.0)
-        self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
-        self.bos_token_id = kwargs.pop("bos_token_id", 0)
-        self.pad_token_id = kwargs.pop("pad_token_id", 0)
-        self.eos_token_ids = kwargs.pop("eos_token_ids", 0)
-        self.length_penalty = kwargs.pop("length_penalty", 1.0)
-        self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
-
-        # Fine-tuning task arguments
-        self.architectures = kwargs.pop("architectures", None)
-        self.finetuning_task = kwargs.pop("finetuning_task", None)
-        self.num_labels = kwargs.pop("num_labels", 2)
-        self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
-        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
-        self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
-        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
-
-        # Additional attributes without default values
-        for key, value in kwargs.items():
-            try:
-                setattr(self, key, value)
-            except AttributeError as err:
-                logger.error("Can't set {} with value {} for {}".format(key, value, self))
-                raise err
-
-    def save_pretrained(self, save_directory):
-        """
-        Save a configuration object to the directory `save_directory`, so that it
-        can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
-
-        Args:
-            save_directory (:obj:`string`):
-                Directory where the configuration JSON file will be saved.
-        """
-        assert os.path.isdir(
-            save_directory
-        ), "Saving path should be a directory where the model and configuration can be saved"
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_config_file = os.path.join(save_directory, CONFIG_NAME)
-
-        self.to_json_file(output_config_file)
-        logger.info("Configuration saved in {}".format(output_config_file))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
-        r"""
-
-        Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
-
-        Args:
-            pretrained_model_name_or_path (:obj:`string`):
-                either:
-                  - a string with the `shortcut name` of a pre-trained model configuration to load from cache or
-                    download, e.g.: ``bert-base-uncased``.
-                  - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to
-                    our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                  - a path to a `directory` containing a configuration file saved using the
-                    :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                  - a path or url to a saved configuration JSON `file`, e.g.:
-                    ``./my_model_directory/configuration.json``.
-            cache_dir (:obj:`string`, `optional`):
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            kwargs (:obj:`Dict[str, any]`, `optional`):
-                The values in kwargs of any keys which are configuration attributes will be used to override the loaded
-                values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is
-                controlled by the `return_unused_kwargs` keyword parameter.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-            proxies (:obj:`Dict`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.:
-                :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.`
-                The proxies are used on each request.
-            return_unused_kwargs: (`optional`) bool:
-                If False, then this function returns just the final configuration object.
-                If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a
-                dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part
-                of kwargs which has not been used to update `config` and is otherwise ignored.
-
-        Returns:
-            :class:`PretrainedConfig`: An instance of a configuration object
-
-        Examples::
-
-            # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
-            # derived class: BertConfig
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-            config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-            assert config.output_attention == True
-            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
-                                                               foo=False, return_unused_kwargs=True)
-            assert config.output_attention == True
-            assert unused_kwargs == {'foo': False}
-
-        """
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-        return cls.from_dict(config_dict, **kwargs)
-
-    @classmethod
-    def get_config_dict(
-        cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs
-    ) -> Tuple[Dict, Dict]:
-        """
-        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used
-        for instantiating a Config using `from_dict`.
-
-        Parameters:
-            pretrained_model_name_or_path (:obj:`string`):
-                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
-            pretrained_config_archive_map: (:obj:`Dict[str, str]`, `optional`) Dict:
-                A map of `shortcut names` to `url`. By default, will use the current class attribute.
-
-        Returns:
-            :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object.
-
-        """
-        cache_dir = kwargs.pop("cache_dir", None)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-
-        if pretrained_config_archive_map is None:
-            pretrained_config_archive_map = cls.pretrained_config_archive_map
-
-        if pretrained_model_name_or_path in pretrained_config_archive_map:
-            config_file = pretrained_config_archive_map[pretrained_model_name_or_path]
-        elif os.path.isdir(pretrained_model_name_or_path):
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
-            config_file = pretrained_model_name_or_path
-        else:
-            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
-
-        try:
-            # Load from URL or cache if already cached
-            resolved_config_file = cached_path(
-                config_file,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-            )
-            # Load config dict
-            if resolved_config_file is None:
-                raise EnvironmentError
-            config_dict = cls._dict_from_json_file(resolved_config_file)
-
-        except EnvironmentError:
-            if pretrained_model_name_or_path in pretrained_config_archive_map:
-                msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                    config_file
-                )
-            else:
-                msg = (
-                    "Model name '{}' was not found in model name list. "
-                    "We assumed '{}' was a path, a model identifier, or url to a configuration file named {} or "
-                    "a directory containing such a file but couldn't find any such file at this path or url.".format(
-                        pretrained_model_name_or_path, config_file, CONFIG_NAME,
-                    )
-                )
-            raise EnvironmentError(msg)
-
-        except json.JSONDecodeError:
-            msg = (
-                "Couldn't reach server at '{}' to download configuration file or "
-                "configuration file is not a valid JSON file. "
-                "Please check network or file content here: {}.".format(config_file, resolved_config_file)
-            )
-            raise EnvironmentError(msg)
-
-        if resolved_config_file == config_file:
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
-
-        return config_dict, kwargs
-
-    @classmethod
-    def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig":
-        """
-        Constructs a `Config` from a Python dictionary of parameters.
-
-        Args:
-            config_dict (:obj:`Dict[str, any]`):
-                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved
-                from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict`
-                method.
-            kwargs (:obj:`Dict[str, any]`):
-                Additional parameters from which to initialize the configuration object.
-
-        Returns:
-            :class:`PretrainedConfig`: An instance of a configuration object
-        """
-        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-
-        config = cls(**config_dict)
-
-        if hasattr(config, "pruned_heads"):
-            config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
-
-        # Update config with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(config, key):
-                setattr(config, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info("Model config %s", str(config))
-        if return_unused_kwargs:
-            return config, kwargs
-        else:
-            return config
-
-    @classmethod
-    def from_json_file(cls, json_file: str) -> "PretrainedConfig":
-        """
-        Constructs a `Config` from the path to a json file of parameters.
-
-        Args:
-            json_file (:obj:`string`):
-                Path to the JSON file containing the parameters.
-
-        Returns:
-            :class:`PretrainedConfig`: An instance of a configuration object
-
-        """
-        config_dict = cls._dict_from_json_file(json_file)
-        return cls(**config_dict)
-
-    @classmethod
-    def _dict_from_json_file(cls, json_file: str):
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        return json.loads(text)
-
-    def __eq__(self, other):
-        return self.__dict__ == other.__dict__
-
-    def __repr__(self):
-        return "{} {}".format(self.__class__.__name__, self.to_json_string())
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary.
-
-        Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        if hasattr(self.__class__, "model_type"):
-            output["model_type"] = self.__class__.model_type
-        return output
-
-    def to_json_string(self):
-        """
-        Serializes this instance to a JSON string.
-
-        Returns:
-            :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format.
-        """
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """
-        Save this instance to a json file.
-
-        Args:
-            json_file_path (:obj:`string`):
-                Path to the JSON file in which this configuration instance's parameters will be saved.
-        """
-        with open(json_file_path, "w", encoding="utf-8") as writer:
-            writer.write(self.to_json_string())
diff --git a/server/transformers/src/transformers/configuration_xlm.py b/server/transformers/src/transformers/configuration_xlm.py
deleted file mode 100644
index c4d61808d6ece169c071b944068a899231b9b28f..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_xlm.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" XLM configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
-    "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
-    "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
-    "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
-    "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
-    "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
-    "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
-    "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
-    "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
-    "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
-}
-
-
-class XLMConfig(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
-        It is used to instantiate an XLM model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the XLM model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
-            emb_dim (:obj:`int`, optional, defaults to 2048):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for the attention mechanism
-            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
-            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
-            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Set this to `True` for the model to behave in a causal manner.
-                Causal models use a triangular attention mask in order to only attend to the left-side context instead
-                if a bidirectional context.
-            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-                layer.
-            n_langs (:obj:`int`, optional, defaults to 1):
-                The number of languages the model handles. Set to 1 for monolingual models.
-            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
-                Whether to use language embeddings. Some models use additional language embeddings, see
-                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-                for information on how to use them.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
-                The standard deviation of the truncated_normal_initializer for
-                initializing the embedding matrices.
-            init_std (:obj:`int`, optional, defaults to 50257):
-                The standard deviation of the truncated_normal_initializer for
-                initializing all weight matrices except the embedding matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            bos_index (:obj:`int`, optional, defaults to 0):
-                The index of the beginning of sentence token in the vocabulary.
-            eos_index (:obj:`int`, optional, defaults to 1):
-                The index of the end of sentence token in the vocabulary.
-            pad_index (:obj:`int`, optional, defaults to 2):
-                The index of the padding token in the vocabulary.
-            unk_index (:obj:`int`, optional, defaults to 3):
-                The index of the unknown token in the vocabulary.
-            mask_index (:obj:`int`, optional, defaults to 5):
-                The index of the masking token in the vocabulary.
-            is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a dropout before the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            mask_token_id (:obj:`int`, optional, defaults to 0):
-                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-            lang_id (:obj:`int`, optional, defaults to 1):
-                The ID of the language used by the model. This parameter is used when generating
-                text in a given language.
-
-        Example::
-
-            from transformers import XLMConfig, XLMModel
-
-            # Initializing a XLM configuration
-            configuration = XLMConfig()
-
-            # Initializing a model from the configuration
-            model = XLMModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "xlm"
-
-    def __init__(
-        self,
-        vocab_size=30145,
-        emb_dim=2048,
-        n_layers=12,
-        n_heads=16,
-        dropout=0.1,
-        attention_dropout=0.1,
-        gelu_activation=True,
-        sinusoidal_embeddings=False,
-        causal=False,
-        asm=False,
-        n_langs=1,
-        use_lang_emb=True,
-        max_position_embeddings=512,
-        embed_init_std=2048 ** -0.5,
-        layer_norm_eps=1e-12,
-        init_std=0.02,
-        bos_index=0,
-        eos_index=1,
-        pad_index=2,
-        unk_index=3,
-        mask_index=5,
-        is_encoder=True,
-        summary_type="first",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        start_n_top=5,
-        end_n_top=5,
-        mask_token_id=0,
-        lang_id=0,
-        **kwargs
-    ):
-        """Constructs XLMConfig.
-        """
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.emb_dim = emb_dim
-        self.n_layers = n_layers
-        self.n_heads = n_heads
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.gelu_activation = gelu_activation
-        self.sinusoidal_embeddings = sinusoidal_embeddings
-        self.causal = causal
-        self.asm = asm
-        self.n_langs = n_langs
-        self.use_lang_emb = use_lang_emb
-        self.layer_norm_eps = layer_norm_eps
-        self.bos_index = bos_index
-        self.eos_index = eos_index
-        self.pad_index = pad_index
-        self.unk_index = unk_index
-        self.mask_index = mask_index
-        self.is_encoder = is_encoder
-        self.max_position_embeddings = max_position_embeddings
-        self.embed_init_std = embed_init_std
-        self.init_std = init_std
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_proj_to_labels = summary_proj_to_labels
-        self.summary_first_dropout = summary_first_dropout
-        self.start_n_top = start_n_top
-        self.end_n_top = end_n_top
-        self.mask_token_id = mask_token_id
-        self.lang_id = lang_id
-
-        if "n_words" in kwargs:
-            self.n_words = kwargs["n_words"]
-
-    @property
-    def n_words(self):  # For backward compatibility
-        return self.vocab_size
-
-    @n_words.setter
-    def n_words(self, value):  # For backward compatibility
-        self.vocab_size = value
-
-    @property
-    def hidden_size(self):
-        return self.emb_dim
-
-    @property
-    def num_attention_heads(self):
-        return self.n_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layers
diff --git a/server/transformers/src/transformers/configuration_xlm_roberta.py b/server/transformers/src/transformers/configuration_xlm_roberta.py
deleted file mode 100644
index 330bc0d41f125399dd95bcf25a13ca1c75f272b0..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_xlm_roberta.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" XLM-RoBERTa configuration """
-
-
-import logging
-
-from .configuration_roberta import RobertaConfig
-
-
-logger = logging.getLogger(__name__)
-
-XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
-    "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
-    "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
-    "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
-    "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
-    "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
-}
-
-
-class XLMRobertaConfig(RobertaConfig):
-    """
-    This class overrides :class:`~transformers.RobertaConfig`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "xlm-roberta"
diff --git a/server/transformers/src/transformers/configuration_xlnet.py b/server/transformers/src/transformers/configuration_xlnet.py
deleted file mode 100644
index 42f6a00c5fd77a4d8528f9762169af3a2cb1ad26..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/configuration_xlnet.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" XLNet configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
-    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
-}
-
-
-class XLNetConfig(PretrainedConfig):
-    """
-        This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
-        It is used to instantiate an XLNet model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 32000):
-                Vocabulary size of the XLNet model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
-            d_model (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 24):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            d_inner (:obj:`int`, optional, defaults to 4096):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            ff_activation (:obj:`string`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Untie relative position biases
-            attn_type (:obj:`string`, optional, defaults to "bi"):
-                The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
-                The number of tokens to cache. The key/value pairs that have already been pre-computed
-                in a previous forward pass won't be re-computed. See the
-                `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
-                for more information.
-            reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
-                The number of tokens in the current batch to be cached and reused in the future.
-            bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use bidirectional input pipeline. Usually set to `True` during
-                pretraining and `False` during finetuning.
-            clamp_len (:obj:`int`, optional, defaults to -1):
-                Clamp all relative distances larger than clamp_len.
-                Setting this attribute to -1 means no clamping.
-            same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use the same attention length for each token.
-            summary_type (:obj:`string`, optional, defaults to "last"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Is one of the following options:
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
-                Add a dropout after the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-
-        Example::
-
-            from transformers import XLNetConfig, XLNetModel
-
-            # Initializing a XLNet configuration
-            configuration = XLNetConfig()
-
-            # Initializing a model from the configuration
-            model = XLNetModel(configuration)
-
-            # Accessing the model configuration
-            configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
-    """
-
-    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "xlnet"
-
-    def __init__(
-        self,
-        vocab_size=32000,
-        d_model=1024,
-        n_layer=24,
-        n_head=16,
-        d_inner=4096,
-        ff_activation="gelu",
-        untie_r=True,
-        attn_type="bi",
-        initializer_range=0.02,
-        layer_norm_eps=1e-12,
-        dropout=0.1,
-        mem_len=None,
-        reuse_len=None,
-        bi_data=False,
-        clamp_len=-1,
-        same_length=False,
-        summary_type="last",
-        summary_use_proj=True,
-        summary_activation="tanh",
-        summary_last_dropout=0.1,
-        start_n_top=5,
-        end_n_top=5,
-        **kwargs
-    ):
-        """Constructs XLNetConfig.
-        """
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.d_model = d_model
-        self.n_layer = n_layer
-        self.n_head = n_head
-        assert d_model % n_head == 0
-        self.d_head = d_model // n_head
-        self.ff_activation = ff_activation
-        self.d_inner = d_inner
-        self.untie_r = untie_r
-        self.attn_type = attn_type
-
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-
-        self.dropout = dropout
-        self.mem_len = mem_len
-        self.reuse_len = reuse_len
-        self.bi_data = bi_data
-        self.clamp_len = clamp_len
-        self.same_length = same_length
-
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_last_dropout = summary_last_dropout
-        self.start_n_top = start_n_top
-        self.end_n_top = end_n_top
-
-    @property
-    def max_position_embeddings(self):
-        return -1
-
-    @property
-    def n_token(self):  # Backward compatibility
-        return self.vocab_size
-
-    @n_token.setter
-    def n_token(self, value):  # Backward compatibility
-        self.vocab_size = value
-
-    @property
-    def hidden_size(self):
-        return self.d_model
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
deleted file mode 100644
index 88658d5a9fd77771b675c0e7c825845c03c0312f..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert ALBERT checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
-    # Initialise PyTorch model
-    config = AlbertConfig.from_json_file(albert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = AlbertForMaskedLM(config)
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
-
-    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--albert_config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The config json file corresponding to the pre-trained ALBERT model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
diff --git a/server/transformers/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index 806ace556a80feba96cd2e1a2fbb97d4ae6d5f5e..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert BERT checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
-    # Initialise PyTorch model
-    config = BertConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = BertForPreTraining(config)
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_bert(model, config, tf_checkpoint_path)
-
-    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--bert_config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The config json file corresponding to the pre-trained BERT model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
diff --git a/server/transformers/src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/server/transformers/src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
deleted file mode 100644
index c451521a461b67ae26a830dbe17b45fbd141a463..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""
-
-import argparse
-import os
-
-import numpy as np
-import tensorflow as tf
-import torch
-
-from transformers import BertModel
-
-
-def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
-
-    """
-    :param model:BertModel Pytorch model instance to be converted
-    :param ckpt_dir: Tensorflow model directory
-    :param model_name: model name
-    :return:
-
-    Currently supported HF models:
-        Y BertModel
-        N BertForMaskedLM
-        N BertForPreTraining
-        N BertForMultipleChoice
-        N BertForNextSentencePrediction
-        N BertForSequenceClassification
-        N BertForQuestionAnswering
-    """
-
-    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")
-
-    var_map = (
-        ("layer.", "layer_"),
-        ("word_embeddings.weight", "word_embeddings"),
-        ("position_embeddings.weight", "position_embeddings"),
-        ("token_type_embeddings.weight", "token_type_embeddings"),
-        (".", "/"),
-        ("LayerNorm/weight", "LayerNorm/gamma"),
-        ("LayerNorm/bias", "LayerNorm/beta"),
-        ("weight", "kernel"),
-    )
-
-    if not os.path.isdir(ckpt_dir):
-        os.makedirs(ckpt_dir)
-
-    state_dict = model.state_dict()
-
-    def to_tf_var_name(name: str):
-        for patt, repl in iter(var_map):
-            name = name.replace(patt, repl)
-        return "bert/{}".format(name)
-
-    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
-        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
-        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
-        session.run(tf.variables_initializer([tf_var]))
-        session.run(tf_var)
-        return tf_var
-
-    tf.reset_default_graph()
-    with tf.Session() as session:
-        for var_name in state_dict:
-            tf_name = to_tf_var_name(var_name)
-            torch_tensor = state_dict[var_name].numpy()
-            if any([x in var_name for x in tensors_to_transpose]):
-                torch_tensor = torch_tensor.T
-            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
-            tf.keras.backend.set_value(tf_var, torch_tensor)
-            tf_weight = session.run(tf_var)
-            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
-
-        saver = tf.train.Saver(tf.trainable_variables())
-        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
-
-
-def main(raw_args=None):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased")
-    parser.add_argument(
-        "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model"
-    )
-    parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/<pytorch-model-name>.bin")
-    parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model")
-    args = parser.parse_args(raw_args)
-
-    model = BertModel.from_pretrained(
-        pretrained_model_name_or_path=args.model_name,
-        state_dict=torch.load(args.pytorch_model_path),
-        cache_dir=args.cache_dir,
-    )
-
-    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index d86b6b0c8861d6f0d7d60be6256fa7342a3affea..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert OpenAI GPT checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
-    # Construct model
-    if gpt2_config_file == "":
-        config = GPT2Config()
-    else:
-        config = GPT2Config.from_json_file(gpt2_config_file)
-    model = GPT2Model(config)
-
-    # Load weights from numpy
-    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    parser.add_argument(
-        "--gpt2_config_file",
-        default="",
-        type=str,
-        help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
-        "This specifies the model architecture.",
-    )
-    args = parser.parse_args()
-    convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
diff --git a/server/transformers/src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index a1e1b80272005ee42dc74fc6696a8f867510dd20..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert OpenAI GPT checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
-    # Construct model
-    if openai_config_file == "":
-        config = OpenAIGPTConfig()
-    else:
-        config = OpenAIGPTConfig.from_json_file(openai_config_file)
-    model = OpenAIGPTModel(config)
-
-    # Load weights from numpy
-    load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--openai_checkpoint_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the TensorFlow checkpoint path.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    parser.add_argument(
-        "--openai_config_file",
-        default="",
-        type=str,
-        help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
-        "This specifies the model architecture.",
-    )
-    args = parser.parse_args()
-    convert_openai_checkpoint_to_pytorch(
-        args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path
-    )
diff --git a/server/transformers/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/server/transformers/src/transformers/convert_pytorch_checkpoint_to_tf2.py
deleted file mode 100644
index a8032f2662e7071b0593117ab9adb0654908504d..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ /dev/null
@@ -1,500 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Convert pytorch checkpoints to TensorFlow """
-
-
-import argparse
-import logging
-import os
-
-from transformers import (
-    ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    AlbertConfig,
-    BertConfig,
-    CamembertConfig,
-    CTRLConfig,
-    DistilBertConfig,
-    GPT2Config,
-    OpenAIGPTConfig,
-    RobertaConfig,
-    T5Config,
-    TFAlbertForMaskedLM,
-    TFBertForPreTraining,
-    TFBertForQuestionAnswering,
-    TFBertForSequenceClassification,
-    TFCamembertForMaskedLM,
-    TFCTRLLMHeadModel,
-    TFDistilBertForMaskedLM,
-    TFDistilBertForQuestionAnswering,
-    TFGPT2LMHeadModel,
-    TFOpenAIGPTLMHeadModel,
-    TFRobertaForMaskedLM,
-    TFRobertaForSequenceClassification,
-    TFT5WithLMHeadModel,
-    TFTransfoXLLMHeadModel,
-    TFXLMRobertaForMaskedLM,
-    TFXLMWithLMHeadModel,
-    TFXLNetLMHeadModel,
-    TransfoXLConfig,
-    XLMConfig,
-    XLMRobertaConfig,
-    XLNetConfig,
-    cached_path,
-    is_torch_available,
-    load_pytorch_checkpoint_in_tf2_model,
-)
-
-
-if is_torch_available():
-    import torch
-    import numpy as np
-    from transformers import (
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        GPT2LMHeadModel,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLNetLMHeadModel,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLMWithLMHeadModel,
-        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLMRobertaForMaskedLM,
-        TransfoXLLMHeadModel,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        OpenAIGPTLMHeadModel,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        RobertaForMaskedLM,
-        RobertaForSequenceClassification,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CamembertForMaskedLM,
-        CamembertForSequenceClassification,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        DistilBertForMaskedLM,
-        DistilBertForQuestionAnswering,
-        DistilBertForSequenceClassification,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CTRLLMHeadModel,
-        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        T5WithLMHeadModel,
-        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-else:
-    (
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        GPT2LMHeadModel,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLNetLMHeadModel,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLMWithLMHeadModel,
-        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLMRobertaForMaskedLM,
-        TransfoXLLMHeadModel,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        OpenAIGPTLMHeadModel,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        RobertaForMaskedLM,
-        RobertaForSequenceClassification,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CamembertForMaskedLM,
-        CamembertForSequenceClassification,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        DistilBertForMaskedLM,
-        DistilBertForSequenceClassification,
-        DistilBertForQuestionAnswering,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CTRLLMHeadModel,
-        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        T5WithLMHeadModel,
-        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    ) = (
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-    )
-
-
-logging.basicConfig(level=logging.INFO)
-
-MODEL_CLASSES = {
-    "bert": (
-        BertConfig,
-        TFBertForPreTraining,
-        BertForPreTraining,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "bert-large-uncased-whole-word-masking-finetuned-squad": (
-        BertConfig,
-        TFBertForQuestionAnswering,
-        BertForQuestionAnswering,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "bert-large-cased-whole-word-masking-finetuned-squad": (
-        BertConfig,
-        TFBertForQuestionAnswering,
-        BertForQuestionAnswering,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "bert-base-cased-finetuned-mrpc": (
-        BertConfig,
-        TFBertForSequenceClassification,
-        BertForSequenceClassification,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "gpt2": (
-        GPT2Config,
-        TFGPT2LMHeadModel,
-        GPT2LMHeadModel,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "xlnet": (
-        XLNetConfig,
-        TFXLNetLMHeadModel,
-        XLNetLMHeadModel,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "xlm": (
-        XLMConfig,
-        TFXLMWithLMHeadModel,
-        XLMWithLMHeadModel,
-        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "xlm-roberta": (
-        XLMRobertaConfig,
-        TFXLMRobertaForMaskedLM,
-        XLMRobertaForMaskedLM,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "transfo-xl": (
-        TransfoXLConfig,
-        TFTransfoXLLMHeadModel,
-        TransfoXLLMHeadModel,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "openai-gpt": (
-        OpenAIGPTConfig,
-        TFOpenAIGPTLMHeadModel,
-        OpenAIGPTLMHeadModel,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "roberta": (
-        RobertaConfig,
-        TFRobertaForMaskedLM,
-        RobertaForMaskedLM,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "roberta-large-mnli": (
-        RobertaConfig,
-        TFRobertaForSequenceClassification,
-        RobertaForSequenceClassification,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "camembert": (
-        CamembertConfig,
-        TFCamembertForMaskedLM,
-        CamembertForMaskedLM,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "distilbert": (
-        DistilBertConfig,
-        TFDistilBertForMaskedLM,
-        DistilBertForMaskedLM,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "distilbert-base-uncased-distilled-squad": (
-        DistilBertConfig,
-        TFDistilBertForQuestionAnswering,
-        DistilBertForQuestionAnswering,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "ctrl": (
-        CTRLConfig,
-        TFCTRLLMHeadModel,
-        CTRLLMHeadModel,
-        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "albert": (
-        AlbertConfig,
-        TFAlbertForMaskedLM,
-        AlbertForMaskedLM,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-    "t5": (
-        T5Config,
-        TFT5WithLMHeadModel,
-        T5WithLMHeadModel,
-        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    ),
-}
-
-
-def convert_pt_checkpoint_to_tf(
-    model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True
-):
-    if model_type not in MODEL_CLASSES:
-        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
-
-    config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
-
-    # Initialise TF model
-    if config_file in aws_config_map:
-        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
-    config = config_class.from_json_file(config_file)
-    config.output_hidden_states = True
-    config.output_attentions = True
-    print("Building TensorFlow model from configuration: {}".format(str(config)))
-    tf_model = model_class(config)
-
-    # Load weights from tf checkpoint
-    if pytorch_checkpoint_path in aws_model_maps:
-        pytorch_checkpoint_path = cached_path(
-            aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models
-        )
-    # Load PyTorch checkpoint in tf2 model:
-    tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
-
-    if compare_with_pt_model:
-        tfo = tf_model(tf_model.dummy_inputs, training=False)  # build the network
-
-        state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
-        pt_model = pt_model_class.from_pretrained(
-            pretrained_model_name_or_path=None, config=config, state_dict=state_dict
-        )
-
-        with torch.no_grad():
-            pto = pt_model(**pt_model.dummy_inputs)
-
-        np_pt = pto[0].numpy()
-        np_tf = tfo[0].numpy()
-        diff = np.amax(np.abs(np_pt - np_tf))
-        print("Max absolute difference between models outputs {}".format(diff))
-        assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
-
-    # Save pytorch-model
-    print("Save TensorFlow model to {}".format(tf_dump_path))
-    tf_model.save_weights(tf_dump_path, save_format="h5")
-
-
-def convert_all_pt_checkpoints_to_tf(
-    args_model_type,
-    tf_dump_path,
-    model_shortcut_names_or_path=None,
-    config_shortcut_names_or_path=None,
-    compare_with_pt_model=False,
-    use_cached_models=False,
-    remove_cached_files=False,
-    only_convert_finetuned_models=False,
-):
-    assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
-
-    if args_model_type is None:
-        model_types = list(MODEL_CLASSES.keys())
-    else:
-        model_types = [args_model_type]
-
-    for j, model_type in enumerate(model_types, start=1):
-        print("=" * 100)
-        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
-        print("=" * 100)
-        if model_type not in MODEL_CLASSES:
-            raise ValueError(
-                "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys()))
-            )
-
-        config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
-
-        if model_shortcut_names_or_path is None:
-            model_shortcut_names_or_path = list(aws_model_maps.keys())
-        if config_shortcut_names_or_path is None:
-            config_shortcut_names_or_path = model_shortcut_names_or_path
-
-        for i, (model_shortcut_name, config_shortcut_name) in enumerate(
-            zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1
-        ):
-            print("-" * 100)
-            if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name:
-                if not only_convert_finetuned_models:
-                    print("    Skipping finetuned checkpoint {}".format(model_shortcut_name))
-                    continue
-                model_type = model_shortcut_name
-            elif only_convert_finetuned_models:
-                print("    Skipping not finetuned checkpoint {}".format(model_shortcut_name))
-                continue
-            print(
-                "    Converting checkpoint {}/{}: {} - model_type {}".format(
-                    i, len(aws_config_map), model_shortcut_name, model_type
-                )
-            )
-            print("-" * 100)
-
-            if config_shortcut_name in aws_config_map:
-                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
-            else:
-                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)
-
-            if model_shortcut_name in aws_model_maps:
-                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
-            else:
-                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
-
-            if os.path.isfile(model_shortcut_name):
-                model_shortcut_name = "converted_model"
-
-            convert_pt_checkpoint_to_tf(
-                model_type=model_type,
-                pytorch_checkpoint_path=model_file,
-                config_file=config_file,
-                tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"),
-                compare_with_pt_model=compare_with_pt_model,
-            )
-            if remove_cached_files:
-                os.remove(config_file)
-                os.remove(model_file)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file."
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(
-            list(MODEL_CLASSES.keys())
-        ),
-    )
-    parser.add_argument(
-        "--pytorch_checkpoint_path",
-        default=None,
-        type=str,
-        help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
-        "If not given, will download and convert all the checkpoints from AWS.",
-    )
-    parser.add_argument(
-        "--config_file",
-        default=None,
-        type=str,
-        help="The config json file corresponding to the pre-trained model. \n"
-        "This specifies the model architecture. If not given and "
-        "--pytorch_checkpoint_path is not given or is a shortcut name"
-        "use the configuration associated to the shortcut name on the AWS",
-    )
-    parser.add_argument(
-        "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions."
-    )
-    parser.add_argument(
-        "--use_cached_models",
-        action="store_true",
-        help="Use cached models if possible instead of updating to latest checkpoint versions.",
-    )
-    parser.add_argument(
-        "--remove_cached_files",
-        action="store_true",
-        help="Remove pytorch models after conversion (save memory when converting in batches).",
-    )
-    parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.")
-    args = parser.parse_args()
-
-    # if args.pytorch_checkpoint_path is not None:
-    #     convert_pt_checkpoint_to_tf(args.model_type.lower(),
-    #                                 args.pytorch_checkpoint_path,
-    #                                 args.config_file if args.config_file is not None else args.pytorch_checkpoint_path,
-    #                                 args.tf_dump_path,
-    #                                 compare_with_pt_model=args.compare_with_pt_model,
-    #                                 use_cached_models=args.use_cached_models)
-    # else:
-    convert_all_pt_checkpoints_to_tf(
-        args.model_type.lower() if args.model_type is not None else None,
-        args.tf_dump_path,
-        model_shortcut_names_or_path=[args.pytorch_checkpoint_path]
-        if args.pytorch_checkpoint_path is not None
-        else None,
-        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
-        compare_with_pt_model=args.compare_with_pt_model,
-        use_cached_models=args.use_cached_models,
-        remove_cached_files=args.remove_cached_files,
-        only_convert_finetuned_models=args.only_convert_finetuned_models,
-    )
diff --git a/server/transformers/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index df4c3414360851a5e1fca1dab0543a5712a34522..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert RoBERTa checkpoint."""
-
-
-import argparse
-import logging
-import pathlib
-
-import fairseq
-import torch
-from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
-from fairseq.modules import TransformerSentenceEncoderLayer
-from packaging import version
-
-from transformers.modeling_bert import (
-    BertConfig,
-    BertIntermediate,
-    BertLayer,
-    BertOutput,
-    BertSelfAttention,
-    BertSelfOutput,
-)
-from transformers.modeling_roberta import RobertaForMaskedLM, RobertaForSequenceClassification
-
-
-if version.parse(fairseq.__version__) < version.parse("0.9.0"):
-    raise Exception("requires fairseq >= 0.9.0")
-
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-SAMPLE_TEXT = "Hello world! cécé herlolip"
-
-
-def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
-    """
-    Copy/paste/tweak roberta's weights to our BERT structure.
-    """
-    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
-    roberta.eval()  # disable dropout
-    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
-    config = BertConfig(
-        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
-        hidden_size=roberta.args.encoder_embed_dim,
-        num_hidden_layers=roberta.args.encoder_layers,
-        num_attention_heads=roberta.args.encoder_attention_heads,
-        intermediate_size=roberta.args.encoder_ffn_embed_dim,
-        max_position_embeddings=514,
-        type_vocab_size=1,
-        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
-    )
-    if classification_head:
-        config.num_labels = roberta.args.num_classes
-    print("Our BERT config:", config)
-
-    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
-    model.eval()
-
-    # Now let's copy all the weights.
-    # Embeddings
-    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
-    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
-    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
-        model.roberta.embeddings.token_type_embeddings.weight
-    )  # just zero them out b/c RoBERTa doesn't use them.
-    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
-    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
-
-    for i in range(config.num_hidden_layers):
-        # Encoder: start of layer
-        layer: BertLayer = model.roberta.encoder.layer[i]
-        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
-
-        # self attention
-        self_attn: BertSelfAttention = layer.attention.self
-        assert (
-            roberta_layer.self_attn.k_proj.weight.data.shape
-            == roberta_layer.self_attn.q_proj.weight.data.shape
-            == roberta_layer.self_attn.v_proj.weight.data.shape
-            == torch.Size((config.hidden_size, config.hidden_size))
-        )
-
-        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
-        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
-        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
-        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
-        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
-        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
-
-        # self-attention output
-        self_output: BertSelfOutput = layer.attention.output
-        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
-        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
-        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
-        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
-        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
-
-        # intermediate
-        intermediate: BertIntermediate = layer.intermediate
-        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
-        intermediate.dense.weight = roberta_layer.fc1.weight
-        intermediate.dense.bias = roberta_layer.fc1.bias
-
-        # output
-        bert_output: BertOutput = layer.output
-        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
-        bert_output.dense.weight = roberta_layer.fc2.weight
-        bert_output.dense.bias = roberta_layer.fc2.bias
-        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
-        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
-        # end of layer
-
-    if classification_head:
-        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
-        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
-        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
-        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
-    else:
-        # LM Head
-        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
-        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
-        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
-        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
-        model.lm_head.bias = roberta.model.decoder.lm_head.bias
-
-    # Let's check that we get the same results.
-    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
-
-    our_output = model(input_ids)[0]
-    if classification_head:
-        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
-    else:
-        their_output = roberta.model(input_ids)[0]
-    print(our_output.shape, their_output.shape)
-    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
-    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
-    success = torch.allclose(our_output, their_output, atol=1e-3)
-    print("Do both models output the same tensors?", "🔥" if success else "💩")
-    if not success:
-        raise Exception("Something went wRoNg")
-
-    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
-    print(f"Saving model to {pytorch_dump_folder_path}")
-    model.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    parser.add_argument(
-        "--classification_head", action="store_true", help="Whether to convert a final classification head."
-    )
-    args = parser.parse_args()
-    convert_roberta_checkpoint_to_pytorch(
-        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
-    )
diff --git a/server/transformers/src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index e497a5a64163c80c6a9f1eb94ab62452e26dc108..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The T5 authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert T5 checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import T5Config, T5Model, load_tf_weights_in_t5
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
-    # Initialise PyTorch model
-    config = T5Config.from_json_file(config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = T5Model(config)
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_t5(model, config, tf_checkpoint_path)
-
-    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The config json file corresponding to the pre-trained T5 model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
diff --git a/server/transformers/src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index 3a9048ba8e831446330fad4cde255d566d4f9e7c..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Transformer XL checkpoint and datasets."""
-
-
-import argparse
-import logging
-import os
-import pickle
-import sys
-
-import torch
-
-import transformers.tokenization_transfo_xl as data_utils
-from transformers import (
-    CONFIG_NAME,
-    WEIGHTS_NAME,
-    TransfoXLConfig,
-    TransfoXLLMHeadModel,
-    load_tf_weights_in_transfo_xl,
-)
-from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
-
-
-logging.basicConfig(level=logging.INFO)
-
-# We do this to be able to load python 2 datasets pickles
-# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
-data_utils.Vocab = data_utils.TransfoXLTokenizer
-data_utils.Corpus = data_utils.TransfoXLCorpus
-sys.modules["data_utils"] = data_utils
-sys.modules["vocabulary"] = data_utils
-
-
-def convert_transfo_xl_checkpoint_to_pytorch(
-    tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file
-):
-    if transfo_xl_dataset_file:
-        # Convert a pre-processed corpus (see original TensorFlow repo)
-        with open(transfo_xl_dataset_file, "rb") as fp:
-            corpus = pickle.load(fp, encoding="latin1")
-        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
-        pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"]
-        print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
-        corpus_vocab_dict = corpus.vocab.__dict__
-        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
-
-        corpus_dict_no_vocab = corpus.__dict__
-        corpus_dict_no_vocab.pop("vocab", None)
-        pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
-        print("Save dataset to {}".format(pytorch_dataset_dump_path))
-        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
-
-    if tf_checkpoint_path:
-        # Convert a pre-trained TensorFlow model
-        config_path = os.path.abspath(transfo_xl_config_file)
-        tf_path = os.path.abspath(tf_checkpoint_path)
-
-        print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
-        # Initialise PyTorch model
-        if transfo_xl_config_file == "":
-            config = TransfoXLConfig()
-        else:
-            config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
-        print("Building PyTorch model from configuration: {}".format(str(config)))
-        model = TransfoXLLMHeadModel(config)
-
-        model = load_tf_weights_in_transfo_xl(model, config, tf_path)
-        # Save pytorch-model
-        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
-        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
-        print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
-        torch.save(model.state_dict(), pytorch_weights_dump_path)
-        print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
-        with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-            f.write(config.to_json_string())
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the folder to store the PyTorch model or dataset/vocab.",
-    )
-    parser.add_argument(
-        "--tf_checkpoint_path",
-        default="",
-        type=str,
-        help="An optional path to a TensorFlow checkpoint path to be converted.",
-    )
-    parser.add_argument(
-        "--transfo_xl_config_file",
-        default="",
-        type=str,
-        help="An optional config json file corresponding to the pre-trained BERT model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--transfo_xl_dataset_file",
-        default="",
-        type=str,
-        help="An optional dataset file to be converted in a vocabulary.",
-    )
-    args = parser.parse_args()
-    convert_transfo_xl_checkpoint_to_pytorch(
-        args.tf_checkpoint_path,
-        args.transfo_xl_config_file,
-        args.pytorch_dump_folder_path,
-        args.transfo_xl_dataset_file,
-    )
diff --git a/server/transformers/src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100755
index 7d66dc5b3132c0a635d50f14693bd815da1bd180..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert OpenAI GPT checkpoint."""
-
-
-import argparse
-import json
-import logging
-
-import numpy
-import torch
-
-from transformers import CONFIG_NAME, WEIGHTS_NAME
-from transformers.tokenization_xlm import VOCAB_FILES_NAMES
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
-    # Load checkpoint
-    chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
-
-    state_dict = chkpt["model"]
-
-    # We have the base model one level deeper than the original XLM repository
-    two_levels_state_dict = {}
-    for k, v in state_dict.items():
-        if "pred_layer" in k:
-            two_levels_state_dict[k] = v
-        else:
-            two_levels_state_dict["transformer." + k] = v
-
-    config = chkpt["params"]
-    config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
-
-    vocab = chkpt["dico_word2id"]
-    vocab = dict((s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items())
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
-    pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
-
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(two_levels_state_dict, pytorch_weights_dump_path)
-
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(json.dumps(config, indent=2) + "\n")
-
-    print("Save vocab file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
-        f.write(json.dumps(vocab, indent=2) + "\n")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/server/transformers/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/server/transformers/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index 51eed0e1214aa0bce2d1adffabb0b599d0dfa0fa..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert BERT checkpoint."""
-
-
-import argparse
-import logging
-import os
-
-import torch
-
-from transformers import (
-    CONFIG_NAME,
-    WEIGHTS_NAME,
-    XLNetConfig,
-    XLNetForQuestionAnswering,
-    XLNetForSequenceClassification,
-    XLNetLMHeadModel,
-    load_tf_weights_in_xlnet,
-)
-
-
-GLUE_TASKS_NUM_LABELS = {
-    "cola": 2,
-    "mnli": 3,
-    "mrpc": 2,
-    "sst-2": 2,
-    "sts-b": 1,
-    "qqp": 2,
-    "qnli": 2,
-    "rte": 2,
-    "wnli": 2,
-}
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_xlnet_checkpoint_to_pytorch(
-    tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
-):
-    # Initialise PyTorch model
-    config = XLNetConfig.from_json_file(bert_config_file)
-
-    finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
-    if finetuning_task in GLUE_TASKS_NUM_LABELS:
-        print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
-        config.finetuning_task = finetuning_task
-        config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
-        model = XLNetForSequenceClassification(config)
-    elif "squad" in finetuning_task:
-        config.finetuning_task = finetuning_task
-        model = XLNetForQuestionAnswering(config)
-    else:
-        model = XLNetLMHeadModel(config)
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
-    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
-    print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--xlnet_config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The config json file corresponding to the pre-trained XLNet model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the folder to store the PyTorch model or dataset/vocab.",
-    )
-    parser.add_argument(
-        "--finetuning_task",
-        default=None,
-        type=str,
-        help="Name of a task on which the XLNet TensorFloaw model was fine-tuned",
-    )
-    args = parser.parse_args()
-    print(args)
-
-    convert_xlnet_checkpoint_to_pytorch(
-        args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task
-    )
diff --git a/server/transformers/src/transformers/data/__init__.py b/server/transformers/src/transformers/data/__init__.py
deleted file mode 100644
index 8d5f6b85b0292359a77a08b2b7f8d8d334f4202b..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-
-from .metrics import is_sklearn_available
-from .processors import (
-    DataProcessor,
-    InputExample,
-    InputFeatures,
-    SingleSentenceClassificationProcessor,
-    SquadExample,
-    SquadFeatures,
-    SquadV1Processor,
-    SquadV2Processor,
-    glue_convert_examples_to_features,
-    glue_output_modes,
-    glue_processors,
-    glue_tasks_num_labels,
-    squad_convert_examples_to_features,
-    xnli_output_modes,
-    xnli_processors,
-    xnli_tasks_num_labels,
-)
-
-
-if is_sklearn_available():
-    from .metrics import glue_compute_metrics, xnli_compute_metrics
diff --git a/server/transformers/src/transformers/data/metrics/__init__.py b/server/transformers/src/transformers/data/metrics/__init__.py
deleted file mode 100644
index 6c29c2313dd4bde827b724e1b0b24b2e300047da..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/metrics/__init__.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-try:
-    from scipy.stats import pearsonr, spearmanr
-    from sklearn.metrics import matthews_corrcoef, f1_score
-
-    _has_sklearn = True
-except (AttributeError, ImportError):
-    _has_sklearn = False
-
-
-def is_sklearn_available():
-    return _has_sklearn
-
-
-if _has_sklearn:
-
-    def simple_accuracy(preds, labels):
-        return (preds == labels).mean()
-
-    def acc_and_f1(preds, labels):
-        acc = simple_accuracy(preds, labels)
-        f1 = f1_score(y_true=labels, y_pred=preds)
-        return {
-            "acc": acc,
-            "f1": f1,
-            "acc_and_f1": (acc + f1) / 2,
-        }
-
-    def pearson_and_spearman(preds, labels):
-        pearson_corr = pearsonr(preds, labels)[0]
-        spearman_corr = spearmanr(preds, labels)[0]
-        return {
-            "pearson": pearson_corr,
-            "spearmanr": spearman_corr,
-            "corr": (pearson_corr + spearman_corr) / 2,
-        }
-
-    def glue_compute_metrics(task_name, preds, labels):
-        assert len(preds) == len(labels)
-        if task_name == "cola":
-            return {"mcc": matthews_corrcoef(labels, preds)}
-        elif task_name == "sst-2":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "mrpc":
-            return acc_and_f1(preds, labels)
-        elif task_name == "sts-b":
-            return pearson_and_spearman(preds, labels)
-        elif task_name == "qqp":
-            return acc_and_f1(preds, labels)
-        elif task_name == "mnli":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "mnli-mm":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "qnli":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "rte":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "wnli":
-            return {"acc": simple_accuracy(preds, labels)}
-        elif task_name == "hans":
-            return {"acc": simple_accuracy(preds, labels)}
-        else:
-            raise KeyError(task_name)
-
-    def xnli_compute_metrics(task_name, preds, labels):
-        assert len(preds) == len(labels)
-        if task_name == "xnli":
-            return {"acc": simple_accuracy(preds, labels)}
-        else:
-            raise KeyError(task_name)
diff --git a/server/transformers/src/transformers/data/metrics/squad_metrics.py b/server/transformers/src/transformers/data/metrics/squad_metrics.py
deleted file mode 100644
index 54fdeb7c7ea1a4d69d7b380aba0f781153fb2ec7..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/metrics/squad_metrics.py
+++ /dev/null
@@ -1,757 +0,0 @@
-""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
-modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
-
-In addition to basic functionality, we also compute additional statistics and
-plot precision-recall curves if an additional na_prob.json file is provided.
-This file is expected to map question ID's to the model's predicted probability
-that a question is unanswerable.
-"""
-
-
-import collections
-import json
-import logging
-import math
-import re
-import string
-
-from transformers.tokenization_bert import BasicTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-
-def normalize_answer(s):
-    """Lower text and remove punctuation, articles and extra whitespace."""
-
-    def remove_articles(text):
-        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-        return re.sub(regex, " ", text)
-
-    def white_space_fix(text):
-        return " ".join(text.split())
-
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return "".join(ch for ch in text if ch not in exclude)
-
-    def lower(text):
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def get_tokens(s):
-    if not s:
-        return []
-    return normalize_answer(s).split()
-
-
-def compute_exact(a_gold, a_pred):
-    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
-
-
-def compute_f1(a_gold, a_pred):
-    gold_toks = get_tokens(a_gold)
-    pred_toks = get_tokens(a_pred)
-    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
-    num_same = sum(common.values())
-    if len(gold_toks) == 0 or len(pred_toks) == 0:
-        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
-        return int(gold_toks == pred_toks)
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(pred_toks)
-    recall = 1.0 * num_same / len(gold_toks)
-    f1 = (2 * precision * recall) / (precision + recall)
-    return f1
-
-
-def get_raw_scores(examples, preds):
-    """
-    Computes the exact and f1 scores from the examples and the model predictions
-    """
-    exact_scores = {}
-    f1_scores = {}
-
-    for example in examples:
-        qas_id = example.qas_id
-        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]
-
-        if not gold_answers:
-            # For unanswerable questions, only correct answer is empty string
-            gold_answers = [""]
-
-        if qas_id not in preds:
-            print("Missing prediction for %s" % qas_id)
-            continue
-
-        prediction = preds[qas_id]
-        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
-        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
-
-    return exact_scores, f1_scores
-
-
-def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
-    new_scores = {}
-    for qid, s in scores.items():
-        pred_na = na_probs[qid] > na_prob_thresh
-        if pred_na:
-            new_scores[qid] = float(not qid_to_has_ans[qid])
-        else:
-            new_scores[qid] = s
-    return new_scores
-
-
-def make_eval_dict(exact_scores, f1_scores, qid_list=None):
-    if not qid_list:
-        total = len(exact_scores)
-        return collections.OrderedDict(
-            [
-                ("exact", 100.0 * sum(exact_scores.values()) / total),
-                ("f1", 100.0 * sum(f1_scores.values()) / total),
-                ("total", total),
-            ]
-        )
-    else:
-        total = len(qid_list)
-        return collections.OrderedDict(
-            [
-                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
-                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
-                ("total", total),
-            ]
-        )
-
-
-def merge_eval(main_eval, new_eval, prefix):
-    for k in new_eval:
-        main_eval["%s_%s" % (prefix, k)] = new_eval[k]
-
-
-def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
-    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-    cur_score = num_no_ans
-    best_score = cur_score
-    best_thresh = 0.0
-    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-    for i, qid in enumerate(qid_list):
-        if qid not in scores:
-            continue
-        if qid_to_has_ans[qid]:
-            diff = scores[qid]
-        else:
-            if preds[qid]:
-                diff = -1
-            else:
-                diff = 0
-        cur_score += diff
-        if cur_score > best_score:
-            best_score = cur_score
-            best_thresh = na_probs[qid]
-
-    has_ans_score, has_ans_cnt = 0, 0
-    for qid in qid_list:
-        if not qid_to_has_ans[qid]:
-            continue
-        has_ans_cnt += 1
-
-        if qid not in scores:
-            continue
-        has_ans_score += scores[qid]
-
-    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
-
-
-def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
-    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
-    main_eval["best_exact"] = best_exact
-    main_eval["best_exact_thresh"] = exact_thresh
-    main_eval["best_f1"] = best_f1
-    main_eval["best_f1_thresh"] = f1_thresh
-    main_eval["has_ans_exact"] = has_ans_exact
-    main_eval["has_ans_f1"] = has_ans_f1
-
-
-def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
-    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
-    cur_score = num_no_ans
-    best_score = cur_score
-    best_thresh = 0.0
-    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
-    for _, qid in enumerate(qid_list):
-        if qid not in scores:
-            continue
-        if qid_to_has_ans[qid]:
-            diff = scores[qid]
-        else:
-            if preds[qid]:
-                diff = -1
-            else:
-                diff = 0
-        cur_score += diff
-        if cur_score > best_score:
-            best_score = cur_score
-            best_thresh = na_probs[qid]
-    return 100.0 * best_score / len(scores), best_thresh
-
-
-def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
-    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
-    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
-
-    main_eval["best_exact"] = best_exact
-    main_eval["best_exact_thresh"] = exact_thresh
-    main_eval["best_f1"] = best_f1
-    main_eval["best_f1_thresh"] = f1_thresh
-
-
-def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
-    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
-    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
-    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
-
-    if no_answer_probs is None:
-        no_answer_probs = {k: 0.0 for k in preds}
-
-    exact, f1 = get_raw_scores(examples, preds)
-
-    exact_threshold = apply_no_ans_threshold(
-        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
-    )
-    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
-
-    evaluation = make_eval_dict(exact_threshold, f1_threshold)
-
-    if has_answer_qids:
-        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
-        merge_eval(evaluation, has_ans_eval, "HasAns")
-
-    if no_answer_qids:
-        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
-        merge_eval(evaluation, no_ans_eval, "NoAns")
-
-    if no_answer_probs:
-        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
-
-    return evaluation
-
-
-def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose_logging:
-            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
-        return orig_text
-
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in tok_ns_to_s_map.items():
-        tok_s_to_ns_map[tok_index] = i
-
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-
-    if orig_start_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map start position")
-        return orig_text
-
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-
-    if orig_end_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map end position")
-        return orig_text
-
-    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
-    return output_text
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
-
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-
-
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
-
-
-def compute_predictions_logits(
-    all_examples,
-    all_features,
-    all_results,
-    n_best_size,
-    max_answer_length,
-    do_lower_case,
-    output_prediction_file,
-    output_nbest_file,
-    output_null_log_odds_file,
-    verbose_logging,
-    version_2_with_negative,
-    null_score_diff_threshold,
-    tokenizer,
-):
-    """Write final predictions to the json file and log-odds of null if needed."""
-    logger.info("Writing predictions to: %s" % (output_prediction_file))
-    logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
-    )
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min null score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if version_2_with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index],
-                        )
-                    )
-        if version_2_with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit,
-                )
-            )
-        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
-
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"]
-        )
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
-
-                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
-
-                # tok_text = " ".join(tok_tokens)
-                #
-                # # De-tokenize WordPieces that have been split off.
-                # tok_text = tok_text.replace(" ##", "")
-                # tok_text = tok_text.replace("##", "")
-
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-
-                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
-                if final_text in seen_predictions:
-                    continue
-
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-
-            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
-        # if we didn't include the empty option in the n-best, include it
-        if version_2_with_negative:
-            if "" not in seen_predictions:
-                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
-
-            # In very rare edge cases we could only have single null prediction.
-            # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest) == 1:
-                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        assert len(nbest) >= 1
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-
-        if not version_2_with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    return all_predictions
-
-
-def compute_predictions_log_probs(
-    all_examples,
-    all_features,
-    all_results,
-    n_best_size,
-    max_answer_length,
-    output_prediction_file,
-    output_nbest_file,
-    output_null_log_odds_file,
-    start_n_top,
-    end_n_top,
-    version_2_with_negative,
-    tokenizer,
-    verbose_logging,
-):
-    """ XLNet write prediction logic (more complex than Bert's).
-        Write final predictions to the json file and log-odds of null if needed.
-
-        Requires utils_squad_evaluate.py
-    """
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
-    )
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
-    )
-
-    logger.info("Writing predictions to: %s", output_prediction_file)
-    # logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-
-            cur_null_score = result.cls_logits
-
-            # if we could have irrelevant answers, get the min score of irrelevant
-            score_null = min(score_null, cur_null_score)
-
-            for i in range(start_n_top):
-                for j in range(end_n_top):
-                    start_log_prob = result.start_logits[i]
-                    start_index = result.start_top_index[i]
-
-                    j_index = i * end_n_top + j
-
-                    end_log_prob = result.end_logits[j_index]
-                    end_index = result.end_top_index[j_index]
-
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= feature.paragraph_len - 1:
-                        continue
-                    if end_index >= feature.paragraph_len - 1:
-                        continue
-
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob,
-                        )
-                    )
-
-        prelim_predictions = sorted(
-            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
-        )
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-
-            # XLNet un-tokenizer
-            # Let's keep it simple for now and see if we need all this later.
-            #
-            # tok_start_to_orig_index = feature.tok_start_to_orig_index
-            # tok_end_to_orig_index = feature.tok_end_to_orig_index
-            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
-            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
-            # paragraph_text = example.paragraph_text
-            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
-
-            # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
-            orig_doc_start = feature.token_to_orig_map[pred.start_index]
-            orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
-            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
-
-            # Clean whitespace
-            tok_text = tok_text.strip()
-            tok_text = " ".join(tok_text.split())
-            orig_text = " ".join(orig_tokens)
-
-            if hasattr(tokenizer, "do_lower_case"):
-                do_lower_case = tokenizer.do_lower_case
-            else:
-                do_lower_case = tokenizer.do_lowercase_and_remove_accent
-
-            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
-
-            if final_text in seen_predictions:
-                continue
-
-            seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
-            )
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_log_prob + entry.end_log_prob)
-            if not best_non_null_entry:
-                best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_log_prob"] = entry.start_log_prob
-            output["end_log_prob"] = entry.end_log_prob
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-        assert best_non_null_entry is not None
-
-        score_diff = score_null
-        scores_diff_json[example.qas_id] = score_diff
-        # note(zhiliny): always predict best_non_null_entry
-        # and the evaluation script will search for the best threshold
-        all_predictions[example.qas_id] = best_non_null_entry.text
-
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    return all_predictions
diff --git a/server/transformers/src/transformers/data/processors/__init__.py b/server/transformers/src/transformers/data/processors/__init__.py
deleted file mode 100644
index 4cb37faf2511f8ee48d7efb83ff38fca92cae892..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/processors/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-
-from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
-from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
-from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
-from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
diff --git a/server/transformers/src/transformers/data/processors/glue.py b/server/transformers/src/transformers/data/processors/glue.py
deleted file mode 100644
index 87885577fabb564556626dbaee549ad2bb0be4fb..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/processors/glue.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" GLUE processors and helpers """
-
-import logging
-import os
-
-from ...file_utils import is_tf_available
-from .utils import DataProcessor, InputExample, InputFeatures
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-logger = logging.getLogger(__name__)
-
-
-def glue_convert_examples_to_features(
-    examples,
-    tokenizer,
-    max_length=512,
-    task=None,
-    label_list=None,
-    output_mode=None,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """
-    Loads a data file into a list of ``InputFeatures``
-
-    Args:
-        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples
-        max_length: Maximum example length
-        task: GLUE task
-        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
-        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
-        pad_token: Padding token
-        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
-        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
-            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
-            actual values)
-
-    Returns:
-        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-        containing the task-specific features. If the input is a list of ``InputExamples``, will return
-        a list of task-specific ``InputFeatures`` which can be fed to the model.
-
-    """
-    is_tf_dataset = False
-    if is_tf_available() and isinstance(examples, tf.data.Dataset):
-        is_tf_dataset = True
-
-    if task is not None:
-        processor = glue_processors[task]()
-        if label_list is None:
-            label_list = processor.get_labels()
-            logger.info("Using label list %s for task %s" % (label_list, task))
-        if output_mode is None:
-            output_mode = glue_output_modes[task]
-            logger.info("Using output mode %s for task %s" % (output_mode, task))
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        len_examples = 0
-        if is_tf_dataset:
-            example = processor.get_example_from_tensor_dict(example)
-            example = processor.tfds_map(example)
-            len_examples = tf.data.experimental.cardinality(examples)
-        else:
-            len_examples = len(examples)
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d/%d" % (ex_index, len_examples))
-
-        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
-        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-        else:
-            input_ids = input_ids + ([pad_token] * padding_length)
-            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
-            len(attention_mask), max_length
-        )
-        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
-            len(token_type_ids), max_length
-        )
-
-        if output_mode == "classification":
-            label = label_map[example.label]
-        elif output_mode == "regression":
-            label = float(example.label)
-        else:
-            raise KeyError(output_mode)
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("guid: %s" % (example.guid))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
-            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label))
-
-        features.append(
-            InputFeatures(
-                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
-            )
-        )
-
-    if is_tf_available() and is_tf_dataset:
-
-        def gen():
-            for ex in features:
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                    },
-                    ex.label,
-                )
-
-        return tf.data.Dataset.from_generator(
-            gen,
-            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-            (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                },
-                tf.TensorShape([]),
-            ),
-        )
-
-    return features
-
-
-class MrpcProcessor(DataProcessor):
-    """Processor for the MRPC data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence1"].numpy().decode("utf-8"),
-            tensor_dict["sentence2"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            text_b = line[4]
-            label = line[0]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliProcessor(DataProcessor):
-    """Processor for the MultiNLI data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["premise"].numpy().decode("utf-8"),
-            tensor_dict["hypothesis"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[8]
-            text_b = line[9]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliMismatchedProcessor(MnliProcessor):
-    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched")
-
-
-class ColaProcessor(DataProcessor):
-    """Processor for the CoLA data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence"].numpy().decode("utf-8"),
-            None,
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            label = line[1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class Sst2Processor(DataProcessor):
-    """Processor for the SST-2 data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence"].numpy().decode("utf-8"),
-            None,
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[0]
-            label = line[1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class StsbProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence1"].numpy().decode("utf-8"),
-            tensor_dict["sentence2"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return [None]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[7]
-            text_b = line[8]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QqpProcessor(DataProcessor):
-    """Processor for the QQP data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["question1"].numpy().decode("utf-8"),
-            tensor_dict["question2"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            try:
-                text_a = line[3]
-                text_b = line[4]
-                label = line[5]
-            except IndexError:
-                continue
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QnliProcessor(DataProcessor):
-    """Processor for the QNLI data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["question"].numpy().decode("utf-8"),
-            tensor_dict["sentence"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class RteProcessor(DataProcessor):
-    """Processor for the RTE data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence1"].numpy().decode("utf-8"),
-            tensor_dict["sentence2"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class WnliProcessor(DataProcessor):
-    """Processor for the WNLI data set (GLUE version)."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["sentence1"].numpy().decode("utf-8"),
-            tensor_dict["sentence2"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-glue_tasks_num_labels = {
-    "cola": 2,
-    "mnli": 3,
-    "mrpc": 2,
-    "sst-2": 2,
-    "sts-b": 1,
-    "qqp": 2,
-    "qnli": 2,
-    "rte": 2,
-    "wnli": 2,
-}
-
-glue_processors = {
-    "cola": ColaProcessor,
-    "mnli": MnliProcessor,
-    "mnli-mm": MnliMismatchedProcessor,
-    "mrpc": MrpcProcessor,
-    "sst-2": Sst2Processor,
-    "sts-b": StsbProcessor,
-    "qqp": QqpProcessor,
-    "qnli": QnliProcessor,
-    "rte": RteProcessor,
-    "wnli": WnliProcessor,
-}
-
-glue_output_modes = {
-    "cola": "classification",
-    "mnli": "classification",
-    "mnli-mm": "classification",
-    "mrpc": "classification",
-    "sst-2": "classification",
-    "sts-b": "regression",
-    "qqp": "classification",
-    "qnli": "classification",
-    "rte": "classification",
-    "wnli": "classification",
-}
diff --git a/server/transformers/src/transformers/data/processors/squad.py b/server/transformers/src/transformers/data/processors/squad.py
deleted file mode 100644
index f2e63e939497399c8d942bdb7012c88cb5d39927..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/processors/squad.py
+++ /dev/null
@@ -1,710 +0,0 @@
-import json
-import logging
-import os
-from functools import partial
-from multiprocessing import Pool, cpu_count
-
-import numpy as np
-from tqdm import tqdm
-
-from ...file_utils import is_tf_available, is_torch_available
-from ...tokenization_bert import whitespace_tokenize
-from .utils import DataProcessor
-
-
-if is_torch_available():
-    import torch
-    from torch.utils.data import TensorDataset
-
-if is_tf_available():
-    import tensorflow as tf
-
-logger = logging.getLogger(__name__)
-
-
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
-    """Returns tokenized answer spans that better match the annotated answer."""
-    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-
-    return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
-
-
-def _new_check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-    # if len(doc_spans) == 1:
-    # return True
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span["start"] + doc_span["length"] - 1
-        if position < doc_span["start"]:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span["start"]
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
-
-
-def _is_whitespace(c):
-    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-        return True
-    return False
-
-
-def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training):
-    features = []
-    if is_training and not example.is_impossible:
-        # Get start and end position
-        start_position = example.start_position
-        end_position = example.end_position
-
-        # If the answer cannot be found in the text, then skip this example.
-        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
-        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
-        if actual_text.find(cleaned_answer_text) == -1:
-            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-            return []
-
-    tok_to_orig_index = []
-    orig_to_tok_index = []
-    all_doc_tokens = []
-    for (i, token) in enumerate(example.doc_tokens):
-        orig_to_tok_index.append(len(all_doc_tokens))
-        sub_tokens = tokenizer.tokenize(token)
-        for sub_token in sub_tokens:
-            tok_to_orig_index.append(i)
-            all_doc_tokens.append(sub_token)
-
-    if is_training and not example.is_impossible:
-        tok_start_position = orig_to_tok_index[example.start_position]
-        if example.end_position < len(example.doc_tokens) - 1:
-            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-        else:
-            tok_end_position = len(all_doc_tokens) - 1
-
-        (tok_start_position, tok_end_position) = _improve_answer_span(
-            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
-        )
-
-    spans = []
-
-    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
-    sequence_added_tokens = (
-        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
-        if "roberta" in str(type(tokenizer))
-        else tokenizer.max_len - tokenizer.max_len_single_sentence
-    )
-    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
-
-    span_doc_tokens = all_doc_tokens
-    while len(spans) * doc_stride < len(all_doc_tokens):
-
-        encoded_dict = tokenizer.encode_plus(
-            truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
-            span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
-            max_length=max_seq_length,
-            return_overflowing_tokens=True,
-            pad_to_max_length=True,
-            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
-        )
-
-        paragraph_len = min(
-            len(all_doc_tokens) - len(spans) * doc_stride,
-            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
-        )
-
-        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
-            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
-        else:
-            non_padded_ids = encoded_dict["input_ids"]
-
-        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
-
-        token_to_orig_map = {}
-        for i in range(paragraph_len):
-            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
-            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
-
-        encoded_dict["paragraph_len"] = paragraph_len
-        encoded_dict["tokens"] = tokens
-        encoded_dict["token_to_orig_map"] = token_to_orig_map
-        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
-        encoded_dict["token_is_max_context"] = {}
-        encoded_dict["start"] = len(spans) * doc_stride
-        encoded_dict["length"] = paragraph_len
-
-        spans.append(encoded_dict)
-
-        if "overflowing_tokens" not in encoded_dict:
-            break
-        span_doc_tokens = encoded_dict["overflowing_tokens"]
-
-    for doc_span_index in range(len(spans)):
-        for j in range(spans[doc_span_index]["paragraph_len"]):
-            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-            index = (
-                j
-                if tokenizer.padding_side == "left"
-                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
-            )
-            spans[doc_span_index]["token_is_max_context"][index] = is_max_context
-
-    for span in spans:
-        # Identify the position of the CLS token
-        cls_index = span["input_ids"].index(tokenizer.cls_token_id)
-
-        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-        # Original TF implem also keep the classification token (set to 0) (not sure why...)
-        p_mask = np.array(span["token_type_ids"])
-
-        p_mask = np.minimum(p_mask, 1)
-
-        if tokenizer.padding_side == "right":
-            # Limit positive values to one
-            p_mask = 1 - p_mask
-
-        p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
-
-        # Set the CLS index to '0'
-        p_mask[cls_index] = 0
-
-        span_is_impossible = example.is_impossible
-        start_position = 0
-        end_position = 0
-        if is_training and not span_is_impossible:
-            # For training, if our document chunk does not contain an annotation
-            # we throw it out, since there is nothing to predict.
-            doc_start = span["start"]
-            doc_end = span["start"] + span["length"] - 1
-            out_of_span = False
-
-            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
-                out_of_span = True
-
-            if out_of_span:
-                start_position = cls_index
-                end_position = cls_index
-                span_is_impossible = True
-            else:
-                if tokenizer.padding_side == "left":
-                    doc_offset = 0
-                else:
-                    doc_offset = len(truncated_query) + sequence_added_tokens
-
-                start_position = tok_start_position - doc_start + doc_offset
-                end_position = tok_end_position - doc_start + doc_offset
-
-        features.append(
-            SquadFeatures(
-                span["input_ids"],
-                span["attention_mask"],
-                span["token_type_ids"],
-                cls_index,
-                p_mask.tolist(),
-                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
-                unique_id=0,
-                paragraph_len=span["paragraph_len"],
-                token_is_max_context=span["token_is_max_context"],
-                tokens=span["tokens"],
-                token_to_orig_map=span["token_to_orig_map"],
-                start_position=start_position,
-                end_position=end_position,
-                is_impossible=span_is_impossible,
-            )
-        )
-    return features
-
-
-def squad_convert_example_to_features_init(tokenizer_for_convert):
-    global tokenizer
-    tokenizer = tokenizer_for_convert
-
-
-def squad_convert_examples_to_features(
-    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1
-):
-    """
-    Converts a list of examples into a list of features that can be directly given as input to a model.
-    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
-
-    Args:
-        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
-        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
-        max_seq_length: The maximum sequence length of the inputs.
-        doc_stride: The stride used when the context is too large and is split across several features.
-        max_query_length: The maximum length of the query.
-        is_training: whether to create features for model evaluation or model training.
-        return_dataset: Default False. Either 'pt' or 'tf'.
-            if 'pt': returns a torch.data.TensorDataset,
-            if 'tf': returns a tf.data.Dataset
-        threads: multiple processing threadsa-smi
-
-
-    Returns:
-        list of :class:`~transformers.data.processors.squad.SquadFeatures`
-
-    Example::
-
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples(data_dir)
-
-        features = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-    """
-
-    # Defining helper methods
-    features = []
-    threads = min(threads, cpu_count())
-    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
-        annotate_ = partial(
-            squad_convert_example_to_features,
-            max_seq_length=max_seq_length,
-            doc_stride=doc_stride,
-            max_query_length=max_query_length,
-            is_training=is_training,
-        )
-        features = list(
-            tqdm(
-                p.imap(annotate_, examples, chunksize=32),
-                total=len(examples),
-                desc="convert squad examples to features",
-            )
-        )
-    new_features = []
-    unique_id = 1000000000
-    example_index = 0
-    for example_features in tqdm(features, total=len(features), desc="add example index and unique id"):
-        if not example_features:
-            continue
-        for example_feature in example_features:
-            example_feature.example_index = example_index
-            example_feature.unique_id = unique_id
-            new_features.append(example_feature)
-            unique_id += 1
-        example_index += 1
-    features = new_features
-    del new_features
-    if return_dataset == "pt":
-        if not is_torch_available():
-            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")
-
-        # Convert to Tensors and build dataset
-        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
-
-        if not is_training:
-            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-            dataset = TensorDataset(
-                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
-            )
-        else:
-            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-            dataset = TensorDataset(
-                all_input_ids,
-                all_attention_masks,
-                all_token_type_ids,
-                all_start_positions,
-                all_end_positions,
-                all_cls_index,
-                all_p_mask,
-                all_is_impossible,
-            )
-
-        return features, dataset
-    elif return_dataset == "tf":
-        if not is_tf_available():
-            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
-
-        def gen():
-            for ex in features:
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                    },
-                    {
-                        "start_position": ex.start_position,
-                        "end_position": ex.end_position,
-                        "cls_index": ex.cls_index,
-                        "p_mask": ex.p_mask,
-                        "is_impossible": ex.is_impossible,
-                    },
-                )
-
-        return tf.data.Dataset.from_generator(
-            gen,
-            (
-                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
-                {
-                    "start_position": tf.int64,
-                    "end_position": tf.int64,
-                    "cls_index": tf.int64,
-                    "p_mask": tf.int32,
-                    "is_impossible": tf.int32,
-                },
-            ),
-            (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                },
-                {
-                    "start_position": tf.TensorShape([]),
-                    "end_position": tf.TensorShape([]),
-                    "cls_index": tf.TensorShape([]),
-                    "p_mask": tf.TensorShape([None]),
-                    "is_impossible": tf.TensorShape([]),
-                },
-            ),
-        )
-
-    return features
-
-
-class SquadProcessor(DataProcessor):
-    """
-    Processor for the SQuAD data set.
-    Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
-    """
-
-    train_file = None
-    dev_file = None
-
-    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
-        if not evaluate:
-            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
-            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
-            answers = []
-        else:
-            answers = [
-                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
-                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
-            ]
-
-            answer = None
-            answer_start = None
-
-        return SquadExample(
-            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
-            question_text=tensor_dict["question"].numpy().decode("utf-8"),
-            context_text=tensor_dict["context"].numpy().decode("utf-8"),
-            answer_text=answer,
-            start_position_character=answer_start,
-            title=tensor_dict["title"].numpy().decode("utf-8"),
-            answers=answers,
-        )
-
-    def get_examples_from_dataset(self, dataset, evaluate=False):
-        """
-        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
-
-        Args:
-            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
-            evaluate: boolean specifying if in evaluation mode or in training mode
-
-        Returns:
-            List of SquadExample
-
-        Examples::
-
-            import tensorflow_datasets as tfds
-            dataset = tfds.load("squad")
-
-            training_examples = get_examples_from_dataset(dataset, evaluate=False)
-            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
-        """
-
-        if evaluate:
-            dataset = dataset["validation"]
-        else:
-            dataset = dataset["train"]
-
-        examples = []
-        for tensor_dict in tqdm(dataset):
-            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
-
-        return examples
-
-    def get_train_examples(self, data_dir, filename=None):
-        """
-        Returns the training examples from the data directory.
-
-        Args:
-            data_dir: Directory containing the data files used for training and evaluating.
-            filename: None by default, specify this if the training file has a different name than the original one
-                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
-
-        """
-        if data_dir is None:
-            data_dir = ""
-
-        if self.train_file is None:
-            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
-
-        with open(
-            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
-        ) as reader:
-            input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "train")
-
-    def get_dev_examples(self, data_dir, filename=None):
-        """
-        Returns the evaluation example from the data directory.
-
-        Args:
-            data_dir: Directory containing the data files used for training and evaluating.
-            filename: None by default, specify this if the evaluation file has a different name than the original one
-                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
-        """
-        if data_dir is None:
-            data_dir = ""
-
-        if self.dev_file is None:
-            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
-
-        with open(
-            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
-        ) as reader:
-            input_data = json.load(reader)["data"]
-        return self._create_examples(input_data, "dev")
-
-    def _create_examples(self, input_data, set_type):
-        is_training = set_type == "train"
-        examples = []
-        for entry in tqdm(input_data):
-            title = entry["title"]
-            for paragraph in entry["paragraphs"]:
-                context_text = paragraph["context"]
-                for qa in paragraph["qas"]:
-                    qas_id = qa["id"]
-                    question_text = qa["question"]
-                    start_position_character = None
-                    answer_text = None
-                    answers = []
-
-                    if "is_impossible" in qa:
-                        is_impossible = qa["is_impossible"]
-                    else:
-                        is_impossible = False
-
-                    if not is_impossible:
-                        if is_training:
-                            answer = qa["answers"][0]
-                            answer_text = answer["text"]
-                            start_position_character = answer["answer_start"]
-                        else:
-                            answers = qa["answers"]
-
-                    example = SquadExample(
-                        qas_id=qas_id,
-                        question_text=question_text,
-                        context_text=context_text,
-                        answer_text=answer_text,
-                        start_position_character=start_position_character,
-                        title=title,
-                        is_impossible=is_impossible,
-                        answers=answers,
-                    )
-
-                    examples.append(example)
-        return examples
-
-
-class SquadV1Processor(SquadProcessor):
-    train_file = "train-v1.1.json"
-    dev_file = "dev-v1.1.json"
-
-
-class SquadV2Processor(SquadProcessor):
-    train_file = "train-v2.0.json"
-    dev_file = "dev-v2.0.json"
-
-
-class SquadExample(object):
-    """
-    A single training/test example for the Squad dataset, as loaded from disk.
-
-    Args:
-        qas_id: The example's unique identifier
-        question_text: The question string
-        context_text: The context string
-        answer_text: The answer string
-        start_position_character: The character position of the start of the answer
-        title: The title of the example
-        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
-        is_impossible: False by default, set to True if the example has no possible answer.
-    """
-
-    def __init__(
-        self,
-        qas_id,
-        question_text,
-        context_text,
-        answer_text,
-        start_position_character,
-        title,
-        answers=[],
-        is_impossible=False,
-    ):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.context_text = context_text
-        self.answer_text = answer_text
-        self.title = title
-        self.is_impossible = is_impossible
-        self.answers = answers
-
-        self.start_position, self.end_position = 0, 0
-
-        doc_tokens = []
-        char_to_word_offset = []
-        prev_is_whitespace = True
-
-        # Split on whitespace so that different tokens may be attributed to their original position.
-        for c in self.context_text:
-            if _is_whitespace(c):
-                prev_is_whitespace = True
-            else:
-                if prev_is_whitespace:
-                    doc_tokens.append(c)
-                else:
-                    doc_tokens[-1] += c
-                prev_is_whitespace = False
-            char_to_word_offset.append(len(doc_tokens) - 1)
-
-        self.doc_tokens = doc_tokens
-        self.char_to_word_offset = char_to_word_offset
-
-        # Start end end positions only has a value during evaluation.
-        if start_position_character is not None and not is_impossible:
-            self.start_position = char_to_word_offset[start_position_character]
-            self.end_position = char_to_word_offset[
-                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
-            ]
-
-
-class SquadFeatures(object):
-    """
-    Single squad example features to be fed to a model.
-    Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
-    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
-
-    Args:
-        input_ids: Indices of input sequence tokens in the vocabulary.
-        attention_mask: Mask to avoid performing attention on padding token indices.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        cls_index: the index of the CLS token.
-        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
-            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
-        example_index: the index of the example
-        unique_id: The unique Feature identifier
-        paragraph_len: The length of the context
-        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
-            If a token does not have their maximum context in this feature object, it means that another feature object
-            has more information related to that token and should be prioritized over this feature for that token.
-        tokens: list of tokens corresponding to the input ids
-        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
-        start_position: start of the answer token index
-        end_position: end of the answer token index
-    """
-
-    def __init__(
-        self,
-        input_ids,
-        attention_mask,
-        token_type_ids,
-        cls_index,
-        p_mask,
-        example_index,
-        unique_id,
-        paragraph_len,
-        token_is_max_context,
-        tokens,
-        token_to_orig_map,
-        start_position,
-        end_position,
-        is_impossible,
-    ):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.cls_index = cls_index
-        self.p_mask = p_mask
-
-        self.example_index = example_index
-        self.unique_id = unique_id
-        self.paragraph_len = paragraph_len
-        self.token_is_max_context = token_is_max_context
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-
-class SquadResult(object):
-    """
-    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
-
-    Args:
-        unique_id: The unique identifier corresponding to that example.
-        start_logits: The logits corresponding to the start of the answer
-        end_logits: The logits corresponding to the end of the answer
-    """
-
-    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
-        self.start_logits = start_logits
-        self.end_logits = end_logits
-        self.unique_id = unique_id
-
-        if start_top_index:
-            self.start_top_index = start_top_index
-            self.end_top_index = end_top_index
-            self.cls_logits = cls_logits
diff --git a/server/transformers/src/transformers/data/processors/utils.py b/server/transformers/src/transformers/data/processors/utils.py
deleted file mode 100644
index 4cc931cdf9ccded2abfec08d9d5044c4acafb7ac..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/processors/utils.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import csv
-import json
-import logging
-
-from ...file_utils import is_tf_available, is_torch_available
-
-
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-    """
-    A single training/test example for simple sequence classification.
-
-    Args:
-        guid: Unique id for the example.
-        text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-        text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
-        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-
-class InputFeatures(object):
-    """
-    A single set of features of data.
-
-    Args:
-        input_ids: Indices of input sequence tokens in the vocabulary.
-        attention_mask: Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
-    """
-
-    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
-        """
-        raise NotImplementedError()
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    def tfds_map(self, example):
-        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
-        This method converts examples to the correct format."""
-        if len(self.get_labels()) > 1:
-            example.label = self.get_labels()[int(example.label)]
-        return example
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8-sig") as f:
-            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
-
-
-class SingleSentenceClassificationProcessor(DataProcessor):
-    """ Generic processor for a single sentence classification data set."""
-
-    def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
-        self.labels = [] if labels is None else labels
-        self.examples = [] if examples is None else examples
-        self.mode = mode
-        self.verbose = verbose
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, idx):
-        if isinstance(idx, slice):
-            return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
-        return self.examples[idx]
-
-    @classmethod
-    def create_from_csv(
-        cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
-    ):
-        processor = cls(**kwargs)
-        processor.add_examples_from_csv(
-            file_name,
-            split_name=split_name,
-            column_label=column_label,
-            column_text=column_text,
-            column_id=column_id,
-            skip_first_row=skip_first_row,
-            overwrite_labels=True,
-            overwrite_examples=True,
-        )
-        return processor
-
-    @classmethod
-    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
-        processor = cls(**kwargs)
-        processor.add_examples(texts_or_text_and_labels, labels=labels)
-        return processor
-
-    def add_examples_from_csv(
-        self,
-        file_name,
-        split_name="",
-        column_label=0,
-        column_text=1,
-        column_id=None,
-        skip_first_row=False,
-        overwrite_labels=False,
-        overwrite_examples=False,
-    ):
-        lines = self._read_tsv(file_name)
-        if skip_first_row:
-            lines = lines[1:]
-        texts = []
-        labels = []
-        ids = []
-        for (i, line) in enumerate(lines):
-            texts.append(line[column_text])
-            labels.append(line[column_label])
-            if column_id is not None:
-                ids.append(line[column_id])
-            else:
-                guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
-                ids.append(guid)
-
-        return self.add_examples(
-            texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
-        )
-
-    def add_examples(
-        self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
-    ):
-        assert labels is None or len(texts_or_text_and_labels) == len(labels)
-        assert ids is None or len(texts_or_text_and_labels) == len(ids)
-        if ids is None:
-            ids = [None] * len(texts_or_text_and_labels)
-        if labels is None:
-            labels = [None] * len(texts_or_text_and_labels)
-        examples = []
-        added_labels = set()
-        for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids):
-            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
-                text, label = text_or_text_and_label
-            else:
-                text = text_or_text_and_label
-            added_labels.add(label)
-            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
-
-        # Update examples
-        if overwrite_examples:
-            self.examples = examples
-        else:
-            self.examples.extend(examples)
-
-        # Update labels
-        if overwrite_labels:
-            self.labels = list(added_labels)
-        else:
-            self.labels = list(set(self.labels).union(added_labels))
-
-        return self.examples
-
-    def get_features(
-        self,
-        tokenizer,
-        max_length=None,
-        pad_on_left=False,
-        pad_token=0,
-        mask_padding_with_zero=True,
-        return_tensors=None,
-    ):
-        """
-        Convert examples in a list of ``InputFeatures``
-
-        Args:
-            tokenizer: Instance of a tokenizer that will tokenize the examples
-            max_length: Maximum example length
-            task: GLUE task
-            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
-            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
-            pad_token: Padding token
-            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
-                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
-                actual values)
-
-        Returns:
-            If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-            containing the task-specific features. If the input is a list of ``InputExamples``, will return
-            a list of task-specific ``InputFeatures`` which can be fed to the model.
-
-        """
-        if max_length is None:
-            max_length = tokenizer.max_len
-
-        label_map = {label: i for i, label in enumerate(self.labels)}
-
-        all_input_ids = []
-        for (ex_index, example) in enumerate(self.examples):
-            if ex_index % 10000 == 0:
-                logger.info("Tokenizing example %d", ex_index)
-
-            input_ids = tokenizer.encode(
-                example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
-            )
-            all_input_ids.append(input_ids)
-
-        batch_length = max(len(input_ids) for input_ids in all_input_ids)
-
-        features = []
-        for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)):
-            if ex_index % 10000 == 0:
-                logger.info("Writing example %d/%d" % (ex_index, len(self.examples)))
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding_length = batch_length - len(input_ids)
-            if pad_on_left:
-                input_ids = ([pad_token] * padding_length) + input_ids
-                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-            else:
-                input_ids = input_ids + ([pad_token] * padding_length)
-                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-
-            assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(
-                len(input_ids), batch_length
-            )
-            assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(
-                len(attention_mask), batch_length
-            )
-
-            if self.mode == "classification":
-                label = label_map[example.label]
-            elif self.mode == "regression":
-                label = float(example.label)
-            else:
-                raise ValueError(self.mode)
-
-            if ex_index < 5 and self.verbose:
-                logger.info("*** Example ***")
-                logger.info("guid: %s" % (example.guid))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
-                logger.info("label: %s (id = %d)" % (example.label, label))
-
-            features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
-
-        if return_tensors is None:
-            return features
-        elif return_tensors == "tf":
-            if not is_tf_available():
-                raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
-            import tensorflow as tf
-
-            def gen():
-                for ex in features:
-                    yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
-
-            dataset = tf.data.Dataset.from_generator(
-                gen,
-                ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
-                ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
-            )
-            return dataset
-        elif return_tensors == "pt":
-            if not is_torch_available():
-                raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
-            import torch
-            from torch.utils.data import TensorDataset
-
-            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-            if self.mode == "classification":
-                all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-            elif self.mode == "regression":
-                all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
-            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
-            return dataset
-        else:
-            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
diff --git a/server/transformers/src/transformers/data/processors/xnli.py b/server/transformers/src/transformers/data/processors/xnli.py
deleted file mode 100644
index 6a744c6280145efb3b305c775db9931dcc8f3e25..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/data/processors/xnli.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" XNLI utils (dataset loading and evaluation) """
-
-
-import logging
-import os
-
-from .utils import DataProcessor, InputExample
-
-
-logger = logging.getLogger(__name__)
-
-
-class XnliProcessor(DataProcessor):
-    """Processor for the XNLI dataset.
-    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
-
-    def __init__(self, language, train_language=None):
-        self.language = language
-        self.train_language = train_language
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        lg = self.language if self.train_language is None else self.train_language
-        lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg)))
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % ("train", i)
-            text_a = line[0]
-            text_b = line[1]
-            label = "contradiction" if line[2] == "contradictory" else line[2]
-            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            language = line[0]
-            if language != self.language:
-                continue
-            guid = "%s-%s" % ("test", i)
-            text_a = line[6]
-            text_b = line[7]
-            label = line[1]
-            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-
-xnli_processors = {
-    "xnli": XnliProcessor,
-}
-
-xnli_output_modes = {
-    "xnli": "classification",
-}
-
-xnli_tasks_num_labels = {
-    "xnli": 3,
-}
diff --git a/server/transformers/src/transformers/file_utils.py b/server/transformers/src/transformers/file_utils.py
deleted file mode 100644
index 8aafa95f432aaad668bb8acddd063ed8d2b265b3..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/file_utils.py
+++ /dev/null
@@ -1,432 +0,0 @@
-"""
-Utilities for working with the local dataset cache.
-This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
-Copyright by the AllenNLP authors.
-"""
-
-import fnmatch
-import json
-import logging
-import os
-import sys
-import tempfile
-from contextlib import contextmanager
-from functools import partial, wraps
-from hashlib import sha256
-from typing import Optional
-from urllib.parse import urlparse
-
-import boto3
-import requests
-from botocore.config import Config
-from botocore.exceptions import ClientError
-from filelock import FileLock
-from tqdm.auto import tqdm
-
-from . import __version__
-
-
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-
-try:
-    USE_TF = os.environ.get("USE_TF", "AUTO").upper()
-    USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
-    if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"):
-        import torch
-
-        _torch_available = True  # pylint: disable=invalid-name
-        logger.info("PyTorch version {} available.".format(torch.__version__))
-    else:
-        logger.info("Disabling PyTorch because USE_TF is set")
-        _torch_available = False
-except ImportError:
-    _torch_available = False  # pylint: disable=invalid-name
-
-try:
-    USE_TF = os.environ.get("USE_TF", "AUTO").upper()
-    USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
-
-    if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"):
-        import tensorflow as tf
-
-        assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2
-        _tf_available = True  # pylint: disable=invalid-name
-        logger.info("TensorFlow version {} available.".format(tf.__version__))
-    else:
-        logger.info("Disabling Tensorflow because USE_TORCH is set")
-        _tf_available = False
-except (ImportError, AssertionError):
-    _tf_available = False  # pylint: disable=invalid-name
-
-try:
-    from torch.hub import _get_torch_home
-
-    torch_cache_home = _get_torch_home()
-except ImportError:
-    torch_cache_home = os.path.expanduser(
-        os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
-    )
-default_cache_path = os.path.join(torch_cache_home, "transformers")
-
-try:
-    from pathlib import Path
-
-    PYTORCH_PRETRAINED_BERT_CACHE = Path(
-        os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path))
-    )
-except (AttributeError, ImportError):
-    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
-        "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
-    )
-
-PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
-TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
-
-WEIGHTS_NAME = "pytorch_model.bin"
-TF2_WEIGHTS_NAME = "tf_model.h5"
-TF_WEIGHTS_NAME = "model.ckpt"
-CONFIG_NAME = "config.json"
-MODEL_CARD_NAME = "modelcard.json"
-
-
-MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]]
-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
-
-S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
-CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
-
-
-def is_torch_available():
-    return _torch_available
-
-
-def is_tf_available():
-    return _tf_available
-
-
-def add_start_docstrings(*docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
-        return fn
-
-    return docstring_decorator
-
-
-def add_start_docstrings_to_callable(*docstr):
-    def docstring_decorator(fn):
-        class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
-        intro = "   The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
-        note = r"""
-
-    .. note::
-        Although the recipe for forward pass needs to be defined within
-        this function, one should call the :class:`Module` instance afterwards
-        instead of this since the former takes care of running the
-        pre and post processing steps while the latter silently ignores them.
-        """
-        fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
-        return fn
-
-    return docstring_decorator
-
-
-def add_end_docstrings(*docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + "".join(docstr)
-        return fn
-
-    return docstring_decorator
-
-
-def is_remote_url(url_or_filename):
-    parsed = urlparse(url_or_filename)
-    return parsed.scheme in ("http", "https", "s3")
-
-
-def hf_bucket_url(identifier, postfix=None, cdn=False) -> str:
-    endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
-    if postfix is None:
-        return "/".join((endpoint, identifier))
-    else:
-        return "/".join((endpoint, identifier, postfix))
-
-
-def url_to_filename(url, etag=None):
-    """
-    Convert `url` into a hashed filename in a repeatable way.
-    If `etag` is specified, append its hash to the url's, delimited
-    by a period.
-    If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
-    so that TF 2.0 can identify it as a HDF5 file
-    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
-    """
-    url_bytes = url.encode("utf-8")
-    url_hash = sha256(url_bytes)
-    filename = url_hash.hexdigest()
-
-    if etag:
-        etag_bytes = etag.encode("utf-8")
-        etag_hash = sha256(etag_bytes)
-        filename += "." + etag_hash.hexdigest()
-
-    if url.endswith(".h5"):
-        filename += ".h5"
-
-    return filename
-
-
-def filename_to_url(filename, cache_dir=None):
-    """
-    Return the url and etag (which may be ``None``) stored for `filename`.
-    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    cache_path = os.path.join(cache_dir, filename)
-    if not os.path.exists(cache_path):
-        raise EnvironmentError("file {} not found".format(cache_path))
-
-    meta_path = cache_path + ".json"
-    if not os.path.exists(meta_path):
-        raise EnvironmentError("file {} not found".format(meta_path))
-
-    with open(meta_path, encoding="utf-8") as meta_file:
-        metadata = json.load(meta_file)
-    url = metadata["url"]
-    etag = metadata["etag"]
-
-    return url, etag
-
-
-def cached_path(
-    url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None
-) -> Optional[str]:
-    """
-    Given something that might be a URL (or might be a local path),
-    determine which. If it's a URL, download the file and cache it, and
-    return the path to the cached file. If it's already a local path,
-    make sure the file exists and then return the path.
-    Args:
-        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
-        force_download: if True, re-dowload the file even if it's already cached in the cache dir.
-        resume_download: if True, resume the download if incompletly recieved file is found.
-        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
-
-    Return:
-        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
-        Local path (string) otherwise
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(url_or_filename, Path):
-        url_or_filename = str(url_or_filename)
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    if is_remote_url(url_or_filename):
-        # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(
-            url_or_filename,
-            cache_dir=cache_dir,
-            force_download=force_download,
-            proxies=proxies,
-            resume_download=resume_download,
-            user_agent=user_agent,
-        )
-    elif os.path.exists(url_or_filename):
-        # File, and it exists.
-        return url_or_filename
-    elif urlparse(url_or_filename).scheme == "":
-        # File, but it doesn't exist.
-        raise EnvironmentError("file {} not found".format(url_or_filename))
-    else:
-        # Something unknown
-        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
-
-
-def split_s3_path(url):
-    """Split a full s3 path into the bucket name and path."""
-    parsed = urlparse(url)
-    if not parsed.netloc or not parsed.path:
-        raise ValueError("bad s3 path {}".format(url))
-    bucket_name = parsed.netloc
-    s3_path = parsed.path
-    # Remove '/' at beginning of path.
-    if s3_path.startswith("/"):
-        s3_path = s3_path[1:]
-    return bucket_name, s3_path
-
-
-def s3_request(func):
-    """
-    Wrapper function for s3 requests in order to create more helpful error
-    messages.
-    """
-
-    @wraps(func)
-    def wrapper(url, *args, **kwargs):
-        try:
-            return func(url, *args, **kwargs)
-        except ClientError as exc:
-            if int(exc.response["Error"]["Code"]) == 404:
-                raise EnvironmentError("file {} not found".format(url))
-            else:
-                raise
-
-    return wrapper
-
-
-@s3_request
-def s3_etag(url, proxies=None):
-    """Check ETag on S3 object."""
-    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
-    bucket_name, s3_path = split_s3_path(url)
-    s3_object = s3_resource.Object(bucket_name, s3_path)
-    return s3_object.e_tag
-
-
-@s3_request
-def s3_get(url, temp_file, proxies=None):
-    """Pull a file directly from S3."""
-    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
-    bucket_name, s3_path = split_s3_path(url)
-    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
-
-
-def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
-    ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
-    if is_torch_available():
-        ua += "; torch/{}".format(torch.__version__)
-    if is_tf_available():
-        ua += "; tensorflow/{}".format(tf.__version__)
-    if isinstance(user_agent, dict):
-        ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, str):
-        ua += "; " + user_agent
-    headers = {"user-agent": ua}
-    if resume_size > 0:
-        headers["Range"] = "bytes=%d-" % (resume_size,)
-    response = requests.get(url, stream=True, proxies=proxies, headers=headers)
-    if response.status_code == 416:  # Range not satisfiable
-        return
-    content_length = response.headers.get("Content-Length")
-    total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(
-        unit="B",
-        unit_scale=True,
-        total=total,
-        initial=resume_size,
-        desc="Downloading",
-        disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
-    )
-    for chunk in response.iter_content(chunk_size=1024):
-        if chunk:  # filter out keep-alive new chunks
-            progress.update(len(chunk))
-            temp_file.write(chunk)
-    progress.close()
-
-
-def get_from_cache(
-    url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None
-) -> Optional[str]:
-    """
-    Given a URL, look for the corresponding file in the local cache.
-    If it's not there, download it. Then return the path to the cached file.
-
-    Return:
-        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
-        Local path (string) otherwise
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    os.makedirs(cache_dir, exist_ok=True)
-
-    # Get eTag to add to filename, if it exists.
-    if url.startswith("s3://"):
-        etag = s3_etag(url, proxies=proxies)
-    else:
-        try:
-            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
-            if response.status_code != 200:
-                etag = None
-            else:
-                etag = response.headers.get("ETag")
-        except (EnvironmentError, requests.exceptions.Timeout):
-            etag = None
-
-    filename = url_to_filename(url, etag)
-
-    # get cache path to put the file
-    cache_path = os.path.join(cache_dir, filename)
-
-    # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
-    # try to get the last downloaded one
-    if etag is None:
-        if os.path.exists(cache_path):
-            return cache_path
-        else:
-            matching_files = [
-                file
-                for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
-                if not file.endswith(".json") and not file.endswith(".lock")
-            ]
-            if len(matching_files) > 0:
-                return os.path.join(cache_dir, matching_files[-1])
-            else:
-                return None
-
-    # From now on, etag is not None.
-    if os.path.exists(cache_path) and not force_download:
-        return cache_path
-
-    # Prevent parallel downloads of the same file with a lock.
-    lock_path = cache_path + ".lock"
-    with FileLock(lock_path):
-
-        if resume_download:
-            incomplete_path = cache_path + ".incomplete"
-
-            @contextmanager
-            def _resumable_file_manager():
-                with open(incomplete_path, "a+b") as f:
-                    yield f
-
-            temp_file_manager = _resumable_file_manager
-            if os.path.exists(incomplete_path):
-                resume_size = os.stat(incomplete_path).st_size
-            else:
-                resume_size = 0
-        else:
-            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
-            resume_size = 0
-
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with temp_file_manager() as temp_file:
-            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
-
-            # GET file object
-            if url.startswith("s3://"):
-                if resume_download:
-                    logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
-                s3_get(url, temp_file, proxies=proxies)
-            else:
-                http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
-
-        logger.info("storing %s in cache at %s", url, cache_path)
-        os.rename(temp_file.name, cache_path)
-
-        logger.info("creating metadata file for %s", cache_path)
-        meta = {"url": url, "etag": etag}
-        meta_path = cache_path + ".json"
-        with open(meta_path, "w") as meta_file:
-            json.dump(meta, meta_file)
-
-    return cache_path
diff --git a/server/transformers/src/transformers/hf_api.py b/server/transformers/src/transformers/hf_api.py
deleted file mode 100644
index c8da5615e5db698f0d36b8627c7972d91ab3af63..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/hf_api.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import io
-import os
-from os.path import expanduser
-from typing import List
-
-import requests
-from tqdm import tqdm
-
-
-ENDPOINT = "https://huggingface.co"
-
-
-class S3Obj:
-    def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs):
-        self.filename = filename
-        self.LastModified = LastModified
-        self.ETag = ETag
-        self.Size = Size
-
-
-class PresignedUrl:
-    def __init__(self, write: str, access: str, type: str, **kwargs):
-        self.write = write
-        self.access = access
-        self.type = type  # mime-type to send to S3.
-
-
-class HfApi:
-    def __init__(self, endpoint=None):
-        self.endpoint = endpoint if endpoint is not None else ENDPOINT
-
-    def login(self, username: str, password: str) -> str:
-        """
-        Call HF API to sign in a user and get a token if credentials are valid.
-
-        Outputs:
-            token if credentials are valid
-
-        Throws:
-            requests.exceptions.HTTPError if credentials are invalid
-        """
-        path = "{}/api/login".format(self.endpoint)
-        r = requests.post(path, json={"username": username, "password": password})
-        r.raise_for_status()
-        d = r.json()
-        return d["token"]
-
-    def whoami(self, token: str) -> str:
-        """
-        Call HF API to know "whoami"
-        """
-        path = "{}/api/whoami".format(self.endpoint)
-        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
-        r.raise_for_status()
-        d = r.json()
-        return d["user"]
-
-    def logout(self, token: str) -> None:
-        """
-        Call HF API to log out.
-        """
-        path = "{}/api/logout".format(self.endpoint)
-        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
-        r.raise_for_status()
-
-    def presign(self, token: str, filename: str) -> PresignedUrl:
-        """
-        Call HF API to get a presigned url to upload `filename` to S3.
-        """
-        path = "{}/api/presign".format(self.endpoint)
-        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename})
-        r.raise_for_status()
-        d = r.json()
-        return PresignedUrl(**d)
-
-    def presign_and_upload(self, token: str, filename: str, filepath: str) -> str:
-        """
-        Get a presigned url, then upload file to S3.
-
-        Outputs:
-            url: Read-only url for the stored file on S3.
-        """
-        urls = self.presign(token, filename=filename)
-        # streaming upload:
-        # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
-        #
-        # Even though we presign with the correct content-type,
-        # the client still has to specify it when uploading the file.
-        with open(filepath, "rb") as f:
-            pf = TqdmProgressFileReader(f)
-            data = f if pf.total_size > 0 else ""
-
-            r = requests.put(urls.write, data=data, headers={"content-type": urls.type})
-            r.raise_for_status()
-            pf.close()
-        return urls.access
-
-    def list_objs(self, token: str) -> List[S3Obj]:
-        """
-        Call HF API to list all stored files for user.
-        """
-        path = "{}/api/listObjs".format(self.endpoint)
-        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
-        r.raise_for_status()
-        d = r.json()
-        return [S3Obj(**x) for x in d]
-
-    def delete_obj(self, token: str, filename: str):
-        """
-        Call HF API to delete a file stored by user
-        """
-        path = "{}/api/deleteObj".format(self.endpoint)
-        r = requests.delete(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename})
-        r.raise_for_status()
-
-
-class TqdmProgressFileReader:
-    """
-    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
-    and override `f.read()` so as to display a tqdm progress bar.
-
-    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
-    for implementation details.
-    """
-
-    def __init__(self, f: io.BufferedReader):
-        self.f = f
-        self.total_size = os.fstat(f.fileno()).st_size
-        self.pbar = tqdm(total=self.total_size, leave=False)
-        self.read = f.read
-        f.read = self._read
-
-    def _read(self, n=-1):
-        self.pbar.update(n)
-        return self.read(n)
-
-    def close(self):
-        self.pbar.close()
-
-
-class HfFolder:
-    path_token = expanduser("~/.huggingface/token")
-
-    @classmethod
-    def save_token(cls, token):
-        """
-        Save token, creating folder as needed.
-        """
-        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        with open(cls.path_token, "w+") as f:
-            f.write(token)
-
-    @classmethod
-    def get_token(cls):
-        """
-        Get token or None if not existent.
-        """
-        try:
-            with open(cls.path_token, "r") as f:
-                return f.read()
-        except FileNotFoundError:
-            pass
-
-    @classmethod
-    def delete_token(cls):
-        """
-        Delete token.
-        Do not fail if token does not exist.
-        """
-        try:
-            os.remove(cls.path_token)
-        except FileNotFoundError:
-            pass
diff --git a/server/transformers/src/transformers/modelcard.py b/server/transformers/src/transformers/modelcard.py
deleted file mode 100644
index 7661a3615485c1c15b642688bf7b5064236a2cf7..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modelcard.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Configuration base class and utilities."""
-
-
-import copy
-import json
-import logging
-import os
-
-from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .file_utils import (
-    CONFIG_NAME,
-    MODEL_CARD_NAME,
-    TF2_WEIGHTS_NAME,
-    WEIGHTS_NAME,
-    cached_path,
-    hf_bucket_url,
-    is_remote_url,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-class ModelCard(object):
-    r""" Model Card class.
-        Store model card as well as methods for loading/downloading/saving model cards.
-
-        Please read the following paper for details and explanation on the sections:
-            "Model Cards for Model Reporting"
-                by Margaret Mitchell, Simone Wu,
-                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
-                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
-            Link: https://arxiv.org/abs/1810.03993
-
-        Note:
-            A model card can be loaded and saved to disk.
-
-        Parameters:
-    """
-
-    def __init__(self, **kwargs):
-        # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
-        self.model_details = kwargs.pop("model_details", {})
-        self.intended_use = kwargs.pop("intended_use", {})
-        self.factors = kwargs.pop("factors", {})
-        self.metrics = kwargs.pop("metrics", {})
-        self.evaluation_data = kwargs.pop("evaluation_data", {})
-        self.training_data = kwargs.pop("training_data", {})
-        self.quantitative_analyses = kwargs.pop("quantitative_analyses", {})
-        self.ethical_considerations = kwargs.pop("ethical_considerations", {})
-        self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {})
-
-        # Open additional attributes
-        for key, value in kwargs.items():
-            try:
-                setattr(self, key, value)
-            except AttributeError as err:
-                logger.error("Can't set {} with value {} for {}".format(key, value, self))
-                raise err
-
-    def save_pretrained(self, save_directory_or_file):
-        """ Save a model card object to the directory or file `save_directory_or_file`.
-        """
-        if os.path.isdir(save_directory_or_file):
-            # If we save using the predefined names, we can load using `from_pretrained`
-            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
-        else:
-            output_model_card_file = save_directory_or_file
-
-        self.to_json_file(output_model_card_file)
-        logger.info("Model card saved in {}".format(output_model_card_file))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
-
-        Parameters:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                card should be cached if the standard cache should not be used.
-
-            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
-
-                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            find_from_standard_name: (`optional`) boolean, default True:
-                If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename.
-                Can be used to directly feed a model/config url and access the colocated modelcard.
-
-            return_unused_kwargs: (`optional`) bool:
-
-                - If False, then this function returns just the final model card object.
-                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
-
-        Examples::
-
-            modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
-            modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
-            modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
-            modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-
-        """
-        cache_dir = kwargs.pop("cache_dir", None)
-        proxies = kwargs.pop("proxies", None)
-        find_from_standard_name = kwargs.pop("find_from_standard_name", True)
-        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-
-        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-            # For simplicity we use the same pretrained url than the configuration files
-            # but with a different suffix (modelcard.json). This suffix is replaced below.
-            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        elif os.path.isdir(pretrained_model_name_or_path):
-            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
-        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
-            model_card_file = pretrained_model_name_or_path
-        else:
-            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
-
-        if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
-            model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME)
-            model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME)
-
-        try:
-            # Load from URL or cache if already cached
-            resolved_model_card_file = cached_path(
-                model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False
-            )
-            if resolved_model_card_file is None:
-                raise EnvironmentError
-            if resolved_model_card_file == model_card_file:
-                logger.info("loading model card file {}".format(model_card_file))
-            else:
-                logger.info(
-                    "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file)
-                )
-            # Load model card
-            modelcard = cls.from_json_file(resolved_model_card_file)
-
-        except EnvironmentError:
-            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file))
-            else:
-                logger.warning(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url to a model card file named {} or "
-                    "a directory containing such a file but couldn't find any such file at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
-                        model_card_file,
-                        MODEL_CARD_NAME,
-                    )
-                )
-            logger.warning("Creating an empty model card.")
-
-            # We fall back on creating an empty model card
-            modelcard = cls()
-
-        except json.JSONDecodeError:
-            logger.warning(
-                "Couldn't reach server at '{}' to download model card file or "
-                "model card file is not a valid JSON file. "
-                "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)
-            )
-            logger.warning("Creating an empty model card.")
-
-            # We fall back on creating an empty model card
-            modelcard = cls()
-
-        # Update model card with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(modelcard, key):
-                setattr(modelcard, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info("Model card: %s", str(modelcard))
-        if return_unused_kwargs:
-            return modelcard, kwargs
-        else:
-            return modelcard
-
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `ModelCard` from a Python dictionary of parameters."""
-        return cls(**json_object)
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `ModelCard` from a json file of parameters."""
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        dict_obj = json.loads(text)
-        return cls(**dict_obj)
-
-    def __eq__(self, other):
-        return self.__dict__ == other.__dict__
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding="utf-8") as writer:
-            writer.write(self.to_json_string())
diff --git a/server/transformers/src/transformers/modeling_albert.py b/server/transformers/src/transformers/modeling_albert.py
deleted file mode 100644
index d2a5d4878e5e4496b29c4d13dc156dc8b1126bd7..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_albert.py
+++ /dev/null
@@ -1,892 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch ALBERT model. """
-
-import logging
-import math
-import os
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from transformers.configuration_albert import AlbertConfig
-from transformers.modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
-from transformers.modeling_utils import PreTrainedModel
-
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-
-
-logger = logging.getLogger(__name__)
-
-
-ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
-    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
-    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
-    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
-    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
-    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
-    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
-    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
-}
-
-
-def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model."""
-    try:
-        import re
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        print(name)
-
-    for name, array in zip(names, arrays):
-        original_name = name
-
-        # If saved from the TF HUB module
-        name = name.replace("module/", "")
-
-        # Renaming and simplifying
-        name = name.replace("ffn_1", "ffn")
-        name = name.replace("bert/", "albert/")
-        name = name.replace("attention_1", "attention")
-        name = name.replace("transform/", "")
-        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
-        name = name.replace("LayerNorm", "attention/LayerNorm")
-        name = name.replace("transformer/", "")
-
-        # The feed forward layer had an 'intermediate' step which has been abstracted away
-        name = name.replace("intermediate/dense/", "")
-        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
-
-        # ALBERT attention was split between self and output which have been abstracted away
-        name = name.replace("/output/", "/")
-        name = name.replace("/self/", "/")
-
-        # The pooler is a linear layer
-        name = name.replace("pooler/dense", "pooler")
-
-        # The classifier was simplified to predictions from cls/predictions
-        name = name.replace("cls/predictions", "predictions")
-        name = name.replace("predictions/attention", "predictions")
-
-        # Naming was changed to be more explicit
-        name = name.replace("embeddings/attention", "embeddings")
-        name = name.replace("inner_group_", "albert_layers/")
-        name = name.replace("group_", "albert_layer_groups/")
-
-        # Classifier
-        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
-            name = "classifier/" + name
-
-        # No ALBERT model currently handles the next sentence prediction task
-        if "seq_relationship" in name:
-            continue
-
-        name = name.split("/")
-
-        # Ignore the gradients applied by the LAMB/ADAM optimizers.
-        if "adam_m" in name or "adam_v" in name or "global_step" in name:
-            logger.info("Skipping {}".format("/".join(name)))
-            continue
-
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {} from {}".format(name, original_name))
-        pointer.data = torch.from_numpy(array)
-
-    return model
-
-
-class AlbertEmbeddings(BertEmbeddings):
-    """
-    Construct the embeddings from word, position and token_type embeddings.
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
-        self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
-
-
-class AlbertAttention(BertSelfAttention):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.output_attentions = config.output_attentions
-        self.num_attention_heads = config.num_attention_heads
-        self.hidden_size = config.hidden_size
-        self.attention_head_size = config.hidden_size // config.num_attention_heads
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
-        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
-        for head in heads:
-            # Compute how many pruned heads are before the head and move the index accordingly
-            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-
-        # Prune linear layers
-        self.query = prune_linear_layer(self.query, index)
-        self.key = prune_linear_layer(self.key, index)
-        self.value = prune_linear_layer(self.value, index)
-        self.dense = prune_linear_layer(self.dense, index, dim=1)
-
-        # Update hyper params and store pruned heads
-        self.num_attention_heads = self.num_attention_heads - len(heads)
-        self.all_head_size = self.attention_head_size * self.num_attention_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def forward(self, input_ids, attention_mask=None, head_mask=None):
-        mixed_query_layer = self.query(input_ids)
-        mixed_key_layer = self.key(input_ids)
-        mixed_value_layer = self.value(input_ids)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer)
-        key_layer = self.transpose_for_scores(mixed_key_layer)
-        value_layer = self.transpose_for_scores(mixed_value_layer)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = torch.matmul(attention_probs, value_layer)
-
-        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-
-        # Should find a better way to do this
-        w = (
-            self.dense.weight.t()
-            .view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
-            .to(context_layer.dtype)
-        )
-        b = self.dense.bias.to(context_layer.dtype)
-
-        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
-        projected_context_layer_dropout = self.dropout(projected_context_layer)
-        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout)
-        return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,)
-
-
-class AlbertLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.config = config
-        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.attention = AlbertAttention(config)
-        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
-        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.activation = ACT2FN[config.hidden_act]
-
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_output = self.attention(hidden_states, attention_mask, head_mask)
-        ffn_output = self.ffn(attention_output[0])
-        ffn_output = self.activation(ffn_output)
-        ffn_output = self.ffn_output(ffn_output)
-        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
-
-        return (hidden_states,) + attention_output[1:]  # add attentions if we output them
-
-
-class AlbertLayerGroup(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
-
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        layer_hidden_states = ()
-        layer_attentions = ()
-
-        for layer_index, albert_layer in enumerate(self.albert_layers):
-            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index])
-            hidden_states = layer_output[0]
-
-            if self.output_attentions:
-                layer_attentions = layer_attentions + (layer_output[1],)
-
-            if self.output_hidden_states:
-                layer_hidden_states = layer_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (layer_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (layer_attentions,)
-        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)
-
-
-class AlbertTransformer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.config = config
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
-        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
-
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
-
-        all_attentions = ()
-
-        if self.output_hidden_states:
-            all_hidden_states = (hidden_states,)
-
-        for i in range(self.config.num_hidden_layers):
-            # Number of layers in a hidden group
-            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
-
-            # Index of the hidden group
-            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
-
-            layer_group_output = self.albert_layer_groups[group_idx](
-                hidden_states,
-                attention_mask,
-                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
-            )
-            hidden_states = layer_group_output[0]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + layer_group_output[-1]
-
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-class AlbertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = AlbertConfig
-    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "albert"
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, (nn.Linear)) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-
-ALBERT_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Args:
-        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-ALBERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
-    ALBERT_START_DOCSTRING,
-)
-class AlbertModel(AlbertPreTrainedModel):
-
-    config_class = AlbertConfig
-    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_albert
-    base_model_prefix = "albert"
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.config = config
-        self.embeddings = AlbertEmbeddings(config)
-        self.encoder = AlbertTransformer(config)
-        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
-        self.pooler_activation = nn.Tanh()
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        old_embeddings = self.embeddings.word_embeddings
-        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
-        self.embeddings.word_embeddings = new_embeddings
-        return self.embeddings.word_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
-            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
-            is a total of 4 different layers.
-
-            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
-            while [2,3] correspond to the two inner groups of the second hidden layer.
-
-            Any layer with in index other than [0,1,2,3] will result in an error.
-            See base class PreTrainedModel for more information about head pruning
-        """
-        for layer, heads in heads_to_prune.items():
-            group_idx = int(layer / self.config.inner_group_num)
-            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
-            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
-
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Example::
-
-        from transformers import AlbertModel, AlbertTokenizer
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertModel.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(
-            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
-        )
-        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
-
-        sequence_output = encoder_outputs[0]
-
-        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
-
-        outputs = (sequence_output, pooled_output) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
-        return outputs
-
-
-class AlbertMLMHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.LayerNorm = nn.LayerNorm(config.embedding_size)
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
-        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
-        self.activation = ACT2FN[config.hidden_act]
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        self.decoder.bias = self.bias
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.activation(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        hidden_states = self.decoder(hidden_states)
-
-        prediction_scores = hidden_states + self.bias
-
-        return prediction_scores
-
-
-@add_start_docstrings(
-    "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
-)
-class AlbertForMaskedLM(AlbertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.albert = AlbertModel(config)
-        self.predictions = AlbertMLMHead(config)
-
-        self.init_weights()
-        self.tie_weights()
-
-    def tie_weights(self):
-        self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
-
-    def get_output_embeddings(self):
-        return self.predictions.decoder
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        masked_lm_labels=None,
-    ):
-        r"""
-        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
-            labels in ``[0, ..., config.vocab_size]``
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Example::
-
-        from transformers import AlbertTokenizer, AlbertForMaskedLM
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
-        """
-        outputs = self.albert(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        sequence_outputs = outputs[0]
-
-        prediction_scores = self.predictions(sequence_outputs)
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-        if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-
-        return outputs
-
-
-@add_start_docstrings(
-    """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    ALBERT_START_DOCSTRING,
-)
-class AlbertForSequenceClassification(AlbertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.albert = AlbertModel(config)
-        self.dropout = nn.Dropout(config.classifier_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
-        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-        Examples::
-
-            from transformers import AlbertTokenizer, AlbertForSequenceClassification
-            import torch
-
-            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
-
-        """
-
-        outputs = self.albert(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    ALBERT_START_DOCSTRING,
-)
-class AlbertForQuestionAnswering(AlbertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.albert = AlbertModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-start scores (before SoftMax).
-        end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-        # examples/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import AlbertTokenizer, AlbertForQuestionAnswering
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
-        start_scores, end_scores = model(**input_dict)
-
-        """
-
-        outputs = self.albert(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_auto.py b/server/transformers/src/transformers/modeling_auto.py
deleted file mode 100644
index fbc8bc03ad38c225754c3444253b1424ebec9e32..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_auto.py
+++ /dev/null
@@ -1,1128 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Auto Model class. """
-
-
-import logging
-from collections import OrderedDict
-
-from .configuration_auto import (
-    AlbertConfig,
-    AutoConfig,
-    BertConfig,
-    CamembertConfig,
-    CTRLConfig,
-    DistilBertConfig,
-    FlaubertConfig,
-    GPT2Config,
-    OpenAIGPTConfig,
-    RobertaConfig,
-    T5Config,
-    TransfoXLConfig,
-    XLMConfig,
-    XLMRobertaConfig,
-    XLNetConfig,
-)
-from .configuration_utils import PretrainedConfig
-from .modeling_albert import (
-    ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    AlbertForMaskedLM,
-    AlbertForQuestionAnswering,
-    AlbertForSequenceClassification,
-    AlbertModel,
-)
-from .modeling_bert import (
-    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    BertForMaskedLM,
-    BertForPreTraining,
-    BertForQuestionAnswering,
-    BertForSequenceClassification,
-    BertForTokenClassification,
-    BertModel,
-)
-from .modeling_camembert import (
-    CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    CamembertForMaskedLM,
-    CamembertForSequenceClassification,
-    CamembertForTokenClassification,
-    CamembertModel,
-)
-from .modeling_ctrl import CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRLModel
-from .modeling_distilbert import (
-    DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    DistilBertForMaskedLM,
-    DistilBertForQuestionAnswering,
-    DistilBertForSequenceClassification,
-    DistilBertForTokenClassification,
-    DistilBertModel,
-)
-from .modeling_flaubert import (
-    FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    FlaubertForQuestionAnswering,
-    FlaubertForSequenceClassification,
-    FlaubertModel,
-    FlaubertWithLMHeadModel,
-)
-from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
-from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
-from .modeling_roberta import (
-    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    RobertaForMaskedLM,
-    RobertaForQuestionAnswering,
-    RobertaForSequenceClassification,
-    RobertaForTokenClassification,
-    RobertaModel,
-)
-from .modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5Model, T5WithLMHeadModel
-from .modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TransfoXLLMHeadModel, TransfoXLModel
-from .modeling_xlm import (
-    XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    XLMForQuestionAnswering,
-    XLMForSequenceClassification,
-    XLMModel,
-    XLMWithLMHeadModel,
-)
-from .modeling_xlm_roberta import (
-    XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    XLMRobertaForMaskedLM,
-    XLMRobertaForSequenceClassification,
-    XLMRobertaForTokenClassification,
-    XLMRobertaModel,
-)
-from .modeling_xlnet import (
-    XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    XLNetForQuestionAnswering,
-    XLNetForSequenceClassification,
-    XLNetForTokenClassification,
-    XLNetLMHeadModel,
-    XLNetModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
-    (key, value)
-    for pretrained_map in [
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    ]
-    for key, value, in pretrained_map.items()
-)
-
-MODEL_MAPPING = OrderedDict(
-    [
-        (T5Config, T5Model),
-        (DistilBertConfig, DistilBertModel),
-        (AlbertConfig, AlbertModel),
-        (CamembertConfig, CamembertModel),
-        (XLMRobertaConfig, XLMRobertaModel),
-        (RobertaConfig, RobertaModel),
-        (BertConfig, BertModel),
-        (OpenAIGPTConfig, OpenAIGPTModel),
-        (GPT2Config, GPT2Model),
-        (TransfoXLConfig, TransfoXLModel),
-        (XLNetConfig, XLNetModel),
-        (FlaubertConfig, FlaubertModel),
-        (XLMConfig, XLMModel),
-        (CTRLConfig, CTRLModel),
-    ]
-)
-
-MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
-    [
-        (T5Config, T5WithLMHeadModel),
-        (DistilBertConfig, DistilBertForMaskedLM),
-        (AlbertConfig, AlbertForMaskedLM),
-        (CamembertConfig, CamembertForMaskedLM),
-        (XLMRobertaConfig, XLMRobertaForMaskedLM),
-        (RobertaConfig, RobertaForMaskedLM),
-        (BertConfig, BertForPreTraining),
-        (OpenAIGPTConfig, OpenAIGPTLMHeadModel),
-        (GPT2Config, GPT2LMHeadModel),
-        (TransfoXLConfig, TransfoXLLMHeadModel),
-        (XLNetConfig, XLNetLMHeadModel),
-        (FlaubertConfig, FlaubertWithLMHeadModel),
-        (XLMConfig, XLMWithLMHeadModel),
-        (CTRLConfig, CTRLLMHeadModel),
-    ]
-)
-
-MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
-    [
-        (T5Config, T5WithLMHeadModel),
-        (DistilBertConfig, DistilBertForMaskedLM),
-        (AlbertConfig, AlbertForMaskedLM),
-        (CamembertConfig, CamembertForMaskedLM),
-        (XLMRobertaConfig, XLMRobertaForMaskedLM),
-        (RobertaConfig, RobertaForMaskedLM),
-        (BertConfig, BertForMaskedLM),
-        (OpenAIGPTConfig, OpenAIGPTLMHeadModel),
-        (GPT2Config, GPT2LMHeadModel),
-        (TransfoXLConfig, TransfoXLLMHeadModel),
-        (XLNetConfig, XLNetLMHeadModel),
-        (FlaubertConfig, FlaubertWithLMHeadModel),
-        (XLMConfig, XLMWithLMHeadModel),
-        (CTRLConfig, CTRLLMHeadModel),
-    ]
-)
-
-MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, DistilBertForSequenceClassification),
-        (AlbertConfig, AlbertForSequenceClassification),
-        (CamembertConfig, CamembertForSequenceClassification),
-        (XLMRobertaConfig, XLMRobertaForSequenceClassification),
-        (RobertaConfig, RobertaForSequenceClassification),
-        (BertConfig, BertForSequenceClassification),
-        (XLNetConfig, XLNetForSequenceClassification),
-        (FlaubertConfig, FlaubertForSequenceClassification),
-        (XLMConfig, XLMForSequenceClassification),
-    ]
-)
-
-MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, DistilBertForQuestionAnswering),
-        (AlbertConfig, AlbertForQuestionAnswering),
-        (RobertaConfig, RobertaForQuestionAnswering),
-        (BertConfig, BertForQuestionAnswering),
-        (XLNetConfig, XLNetForQuestionAnswering),
-        (FlaubertConfig, FlaubertForQuestionAnswering),
-        (XLMConfig, XLMForQuestionAnswering),
-    ]
-)
-
-MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, DistilBertForTokenClassification),
-        (CamembertConfig, CamembertForTokenClassification),
-        (XLMRobertaConfig, XLMRobertaForTokenClassification),
-        (RobertaConfig, RobertaForTokenClassification),
-        (BertConfig, BertForTokenClassification),
-        (XLNetConfig, XLNetForTokenClassification),
-    ]
-)
-
-
-class AutoModel(object):
-    r"""
-        :class:`~transformers.AutoModel` is a generic model class
-        that will be instantiated as one of the base model classes of the library
-        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        or the `AutoModel.from_config(config)` class methods.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModel is designed to be instantiated "
-            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModel.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModel` (DistilBERT model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModel` (RoBERTa model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertModel` (Bert model)
-                - isInstance of `openai-gpt` configuration class: :class:`~transformers.OpenAIGPTModel` (OpenAI GPT model)
-                - isInstance of `gpt2` configuration class: :class:`~transformers.GPT2Model` (OpenAI GPT-2 model)
-                - isInstance of `ctrl` configuration class: :class:`~transformers.CTRLModel` (Salesforce CTRL  model)
-                - isInstance of `transfo-xl` configuration class: :class:`~transformers.TransfoXLModel` (Transformer-XL model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModel` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.XLMModel` (XLM model)
-                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertModel` (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the base model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: :class:`~transformers.T5Model` (T5 model)
-            - contains `distilbert`: :class:`~transformers.DistilBertModel` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertModel` (ALBERT model)
-            - contains `camembert`: :class:`~transformers.CamembertModel` (CamemBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaModel` (XLM-RoBERTa model)
-            - contains `roberta`: :class:`~transformers.RobertaModel` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.BertModel` (Bert model)
-            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTModel` (OpenAI GPT model)
-            - contains `gpt2`: :class:`~transformers.GPT2Model` (OpenAI GPT-2 model)
-            - contains `transfo-xl`: :class:`~transformers.TransfoXLModel` (Transformer-XL model)
-            - contains `xlnet`: :class:`~transformers.XLNetModel` (XLNet model)
-            - contains `xlm`: :class:`~transformers.XLMModel` (XLM model)
-            - contains `ctrl`: :class:`~transformers.CTRLModel` (Salesforce CTRL  model)
-            - contains `flaubert`: :class:`~transformers.Flaubert` (Flaubert  model)
-
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-            To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_MAPPING.keys())
-            )
-        )
-
-
-class AutoModelForPreTraining(object):
-    r"""
-        :class:`~transformers.AutoModelForPreTraining` is a generic model class
-        that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModelForPreTraining is designed to be instantiated "
-            "using the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForPreTraining.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModelForMaskedLM` (DistilBERT model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForMaskedLM` (RoBERTa model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertForPreTraining` (Bert model)
-                - isInstance of `openai-gpt` configuration class: :class:`~transformers.OpenAIGPTLMHeadModel` (OpenAI GPT model)
-                - isInstance of `gpt2` configuration class: :class:`~transformers.GPT2ModelLMHeadModel` (OpenAI GPT-2 model)
-                - isInstance of `ctrl` configuration class: :class:`~transformers.CTRLModelLMHeadModel` (Salesforce CTRL  model)
-                - isInstance of `transfo-xl` configuration class: :class:`~transformers.TransfoXLLMHeadModel` (Transformer-XL model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetLMHeadModel` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
-                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: :class:`~transformers.T5ModelWithLMHead` (T5 model)
-            - contains `distilbert`: :class:`~transformers.DistilBertForMaskedLM` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertForMaskedLM` (ALBERT model)
-            - contains `camembert`: :class:`~transformers.CamembertForMaskedLM` (CamemBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForMaskedLM` (XLM-RoBERTa model)
-            - contains `roberta`: :class:`~transformers.RobertaForMaskedLM` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.BertForPreTraining` (Bert model)
-            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTLMHeadModel` (OpenAI GPT model)
-            - contains `gpt2`: :class:`~transformers.GPT2LMHeadModel` (OpenAI GPT-2 model)
-            - contains `transfo-xl`: :class:`~transformers.TransfoXLLMHeadModel` (Transformer-XL model)
-            - contains `xlnet`: :class:`~transformers.XLNetLMHeadModel` (XLNet model)
-            - contains `xlm`: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
-            - contains `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model)
-            - contains `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path:
-                Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model.
-                (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or
-                automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                  underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
-                  already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                  initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                  ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                  with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                  attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModelForPreTraining.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModelForPreTraining.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModelForPreTraining.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_PRETRAINING_MAPPING.keys())
-            )
-        )
-
-
-class AutoModelWithLMHead(object):
-    r"""
-        :class:`~transformers.AutoModelWithLMHead` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelWithLMHead.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModelForMaskedLM` (DistilBERT model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForMaskedLM` (RoBERTa model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertModelForMaskedLM` (Bert model)
-                - isInstance of `openai-gpt` configuration class: :class:`~transformers.OpenAIGPTLMHeadModel` (OpenAI GPT model)
-                - isInstance of `gpt2` configuration class: :class:`~transformers.GPT2ModelLMHeadModel` (OpenAI GPT-2 model)
-                - isInstance of `ctrl` configuration class: :class:`~transformers.CTRLModelLMHeadModel` (Salesforce CTRL  model)
-                - isInstance of `transfo-xl` configuration class: :class:`~transformers.TransfoXLLMHeadModel` (Transformer-XL model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetLMHeadModel` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
-                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: :class:`~transformers.T5ModelWithLMHead` (T5 model)
-            - contains `distilbert`: :class:`~transformers.DistilBertForMaskedLM` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertForMaskedLM` (ALBERT model)
-            - contains `camembert`: :class:`~transformers.CamembertForMaskedLM` (CamemBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForMaskedLM` (XLM-RoBERTa model)
-            - contains `roberta`: :class:`~transformers.RobertaForMaskedLM` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.BertForMaskedLM` (Bert model)
-            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTLMHeadModel` (OpenAI GPT model)
-            - contains `gpt2`: :class:`~transformers.GPT2LMHeadModel` (OpenAI GPT-2 model)
-            - contains `transfo-xl`: :class:`~transformers.TransfoXLLMHeadModel` (Transformer-XL model)
-            - contains `xlnet`: :class:`~transformers.XLNetLMHeadModel` (XLNet model)
-            - contains `xlm`: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
-            - contains `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model)
-            - contains `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path:
-                Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model.
-                (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or
-                automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                  underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
-                  already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                  initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                  ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                  with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                  attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_WITH_LM_HEAD_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_WITH_LM_HEAD_MAPPING.keys())
-            )
-        )
-
-
-class AutoModelForSequenceClassification(object):
-    r"""
-        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
-        that will be instantiated as one of the sequence classification model classes of the library
-        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModelForSequenceClassification is designed to be instantiated "
-            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForSequenceClassification.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModelForSequenceClassification` (DistilBERT model)
-                - isInstance of `albert` configuration class: :class:`~transformers.AlbertModelForSequenceClassification` (ALBERT model)
-                - isInstance of `camembert` configuration class: :class:`~transformers.CamembertModelForSequenceClassification` (CamemBERT model)
-                - isInstance of `xlm roberta` configuration class: :class:`~transformers.XLMRobertaModelForSequenceClassification` (XLM-RoBERTa model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForSequenceClassification` (RoBERTa model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertModelForSequenceClassification` (Bert model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModelForSequenceClassification` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.XLMModelForSequenceClassification` (XLM model)
-                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
-
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the sequence classification model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: :class:`~transformers.DistilBertForSequenceClassification` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertForSequenceClassification` (ALBERT model)
-            - contains `camembert`: :class:`~transformers.CamembertForSequenceClassification` (CamemBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForSequenceClassification` (XLM-RoBERTa model)
-            - contains `roberta`: :class:`~transformers.RobertaForSequenceClassification` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.BertForSequenceClassification` (Bert model)
-            - contains `xlnet`: :class:`~transformers.XLNetForSequenceClassification` (XLNet model)
-            - contains `flaubert`: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-
-class AutoModelForQuestionAnswering(object):
-    r"""
-        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
-        that will be instantiated as one of the question answering model classes of the library
-        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModelForQuestionAnswering is designed to be instantiated "
-            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForQuestionAnswering.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModelForQuestionAnswering` (DistilBERT model)
-                - isInstance of `albert` configuration class: :class:`~transformers.AlbertModelForQuestionAnswering` (ALBERT model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertModelForQuestionAnswering` (Bert model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModelForQuestionAnswering` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.XLMModelForQuestionAnswering` (XLM model)
-                - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: :class:`~transformers.DistilBertForQuestionAnswering` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.AlbertForQuestionAnswering` (ALBERT model)
-            - contains `bert`: :class:`~transformers.BertForQuestionAnswering` (Bert model)
-            - contains `xlnet`: :class:`~transformers.XLNetForQuestionAnswering` (XLNet model)
-            - contains `xlm`: :class:`~transformers.XLMForQuestionAnswering` (XLM model)
-            - contains `flaubert`: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()),
-            )
-        )
-
-
-class AutoModelForTokenClassification:
-    r"""
-        :class:`~transformers.AutoModelForTokenClassification` is a generic model class
-        that will be instantiated as one of the token classification model classes of the library
-        when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoModelForTokenClassification is designed to be instantiated "
-            "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForTokenClassification.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertModelForTokenClassification` (DistilBERT model)
-                - isInstance of `xlm roberta` configuration class: :class:`~transformers.XLMRobertaModelForTokenClassification` (XLMRoberta model)
-                - isInstance of `bert` configuration class: :class:`~transformers.BertModelForTokenClassification` (Bert model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModelForTokenClassification` (XLNet model)
-                - isInstance of `camembert` configuration class: :class:`~transformers.CamembertModelForTokenClassification` (Camembert model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForTokenClassification` (Roberta model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForTokenClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: :class:`~transformers.DistilBertForTokenClassification` (DistilBERT model)
-            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForTokenClassification` (XLM-RoBERTa?Para model)
-            - contains `camembert`: :class:`~transformers.CamembertForTokenClassification` (Camembert model)
-            - contains `bert`: :class:`~transformers.BertForTokenClassification` (Bert model)
-            - contains `xlnet`: :class:`~transformers.XLNetForTokenClassification` (XLNet model)
-            - contains `roberta`: :class:`~transformers.RobertaForTokenClassification` (Roberta model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path:
-                Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
diff --git a/server/transformers/src/transformers/modeling_bert.py b/server/transformers/src/transformers/modeling_bert.py
deleted file mode 100644
index caa056b64cb9869634c973cf525cdd4c6c7f88c9..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_bert.py
+++ /dev/null
@@ -1,1535 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""PyTorch BERT model. """
-
-
-import logging
-import math
-import os
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from .configuration_bert import BertConfig
-from .modeling_utils import PreTrainedModel, prune_linear_layer, transpose_iterable
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-
-logger = logging.getLogger(__name__)
-
-BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
-    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
-    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
-    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
-    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
-    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
-    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
-    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
-    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
-    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
-    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
-    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
-    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
-    "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
-    "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
-    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
-    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
-    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
-    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
-    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
-    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
-    "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/pytorch_model.bin",
-}
-
-
-def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
-    try:
-        import re
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            logger.info("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-def gelu(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
-def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-def mish(x):
-    return x * torch.tanh(nn.functional.softplus(x))
-
-
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
-
-
-BertLayerNorm = torch.nn.LayerNorm
-
-
-class BertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
-    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        seq_length = input_shape[1]
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        if position_ids is None:
-            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).expand(input_shape)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
-        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-
-class BertSelfAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        if config.hidden_size % config.num_attention_heads != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
-            )
-        self.output_attentions = config.output_attentions
-        self.output_additional_info = config.output_additional_info
-
-        self.num_attention_heads = config.num_attention_heads
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-
-        self.query = nn.Linear(config.hidden_size, self.all_head_size)
-        self.key = nn.Linear(config.hidden_size, self.all_head_size)
-        self.value = nn.Linear(config.hidden_size, self.all_head_size)
-
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
-
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-        mixed_query_layer = self.query(hidden_states)
-
-        # If this is instantiated as a cross-attention module, the keys
-        # and values come from an encoder; the attention mask needs to be
-        # such that the encoder's padding tokens are not attended to.
-        if encoder_hidden_states is not None:
-            mixed_key_layer = self.key(encoder_hidden_states)
-            mixed_value_layer = self.value(encoder_hidden_states)
-            attention_mask = encoder_attention_mask
-        else:
-            mixed_key_layer = self.key(hidden_states)
-            mixed_value_layer = self.value(hidden_states)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer)
-        key_layer = self.transpose_for_scores(mixed_key_layer)
-        value_layer = self.transpose_for_scores(mixed_value_layer)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = torch.matmul(attention_probs, value_layer)
-
-        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        new_context_layer = context_layer.view(*new_context_layer_shape)
-
-        outputs = (new_context_layer,)
-        if self.output_attentions:
-            outputs += (attention_probs,)
-            if self.output_additional_info: # Only support additional info if attentions are desired
-                outputs += (context_layer,)
-
-        return outputs
-
-
-class BertSelfOutput(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
-    def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
-
-
-class BertAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.self = BertSelfAttention(config)
-        self.output = BertSelfOutput(config)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
-        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
-        for head in heads:
-            # Compute how many pruned heads are before the head and move the index accordingly
-            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-
-        # Prune linear layers
-        self.self.query = prune_linear_layer(self.self.query, index)
-        self.self.key = prune_linear_layer(self.self.key, index)
-        self.self.value = prune_linear_layer(self.self.value, index)
-        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
-
-        # Update hyper params and store pruned heads
-        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
-        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-        self_outputs = self.self(
-            hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
-        )
-        attention_output = self.output(self_outputs[0], hidden_states)
-        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
-class BertIntermediate(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.intermediate_act_fn = config.hidden_act
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
-        return hidden_states
-
-
-class BertOutput(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
-    def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
-
-
-class BertLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.attention = BertAttention(config)
-        self.is_decoder = config.is_decoder
-        if self.is_decoder:
-            self.crossattention = BertAttention(config)
-        self.intermediate = BertIntermediate(config)
-        self.output = BertOutput(config)
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-        self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
-        attention_output = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
-
-        if self.is_decoder and encoder_hidden_states is not None:
-            cross_attention_outputs = self.crossattention(
-                attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
-            )
-            attention_output = cross_attention_outputs[0]
-            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
-
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        outputs = (layer_output,) + outputs
-        return outputs
-
-
-class BertEncoder(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.output_additional_info = config.output_additional_info
-        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-        all_hidden_states = ()
-        all_attentions = ()
-        all_additional_info = ()
-        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(
-                hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
-            )
-            hidden_states = layer_outputs[0]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-                if self.output_additional_info:
-                    all_additional_info = all_additional_info + (layer_outputs[2],)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-            if self.output_additional_info:
-                outputs = outputs + (all_additional_info,)
-            
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-class BertPooler(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.activation = nn.Tanh()
-
-    def forward(self, hidden_states):
-        # We "pool" the model by simply taking the hidden state corresponding
-        # to the first token.
-        first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense(first_token_tensor)
-        pooled_output = self.activation(pooled_output)
-        return pooled_output
-
-
-class BertPredictionHeadTransform(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        if isinstance(config.hidden_act, str):
-            self.transform_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.transform_act_fn = config.hidden_act
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.transform_act_fn(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        return hidden_states
-
-
-class BertLMPredictionHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.transform = BertPredictionHeadTransform(config)
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        self.decoder.bias = self.bias
-
-    def forward(self, hidden_states):
-        hidden_states = self.transform(hidden_states)
-        hidden_states = self.decoder(hidden_states) + self.bias
-        return hidden_states
-
-
-class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = BertLMPredictionHead(config)
-
-    def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
-
-
-class BertOnlyNSPHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = BertLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, sequence_output, pooled_output):
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
-
-
-class BertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = BertConfig
-    pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_bert
-    base_model_prefix = "bert"
-
-    def _init_weights(self, module):
-        """ Initialize the weights """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, BertLayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-
-BERT_START_DOCSTRING = r"""
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-BERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-"""
-
-
-@add_start_docstrings(
-    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
-    BERT_START_DOCSTRING,
-)
-class BertModel(BertPreTrainedModel):
-    """
-
-    The model can behave as an encoder (with only self-attention) as well
-    as a decoder, in which case a layer of cross-attention is added between
-    the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
-    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
-
-    To behave as an decoder the model needs to be initialized with the
-    :obj:`is_decoder` argument of the configuration set to :obj:`True`; an
-    :obj:`encoder_hidden_states` is expected as an input to the forward pass.
-
-    .. _`Attention is all you need`:
-        https://arxiv.org/abs/1706.03762
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-
-        self.embeddings = BertEmbeddings(config)
-        self.encoder = BertEncoder(config)
-        self.pooler = BertPooler(config)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
-
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertModel, BertTokenizer
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        if attention_mask.dim() == 3:
-            extended_attention_mask = attention_mask[:, None, :, :]
-        elif attention_mask.dim() == 2:
-            # Provided a padding mask of dimensions [batch_size, seq_length]
-            # - if the model is a decoder, apply a causal mask in addition to the padding mask
-            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-            if self.config.is_decoder:
-                batch_size, seq_length = input_shape
-                seq_ids = torch.arange(seq_length, device=device)
-                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
-                causal_mask = causal_mask.to(
-                    torch.long
-                )  # not converting to long will cause errors with pytorch version < 1.3
-                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
-            else:
-                extended_attention_mask = attention_mask[:, None, None, :]
-        else:
-            raise ValueError(
-                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
-                    input_shape, attention_mask.shape
-                )
-            )
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if self.config.is_decoder and encoder_hidden_states is not None:
-            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
-            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
-            if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
-
-            if encoder_attention_mask.dim() == 3:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-            elif encoder_attention_mask.dim() == 2:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
-            else:
-                raise ValueError(
-                    "Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(
-                        encoder_hidden_shape, encoder_attention_mask.shape
-                    )
-                )
-
-            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # fp16 compatibility
-            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
-        else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
-        )
-        encoder_outputs = self.encoder(
-            embedding_output,
-            attention_mask=extended_attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_extended_attention_mask,
-        )
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
-
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
-    a `next sentence prediction (classification)` head. """,
-    BERT_START_DOCSTRING,
-)
-class BertForPreTraining(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.cls.predictions.decoder
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        masked_lm_labels=None,
-        next_sentence_label=None,
-    ):
-        r"""
-        masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False
-            continuation before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForPreTraining
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForPreTraining.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        prediction_scores, seq_relationship_scores = outputs[:2]
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output, pooled_output = outputs[:2]
-        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
-
-        outputs = (prediction_scores, seq_relationship_score,) + outputs[
-            2:
-        ]  # add hidden states and attention if they are here
-
-        if masked_lm_labels is not None and next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
-            total_loss = masked_lm_loss + next_sentence_loss
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-
-
-@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
-class BertForMaskedLM(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.cls.predictions.decoder
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        masked_lm_labels=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        lm_labels=None,
-    ):
-        r"""
-        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided):
-                Next token prediction loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-        Examples::
-
-            from transformers import BertTokenizer, BertForMaskedLM
-            import torch
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, masked_lm_labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-
-        sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output)
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-
-        # Although this may seem awkward, BertForMaskedLM supports two scenarios:
-        # 1. If a tensor that contains the indices of masked labels is provided,
-        #    the cross-entropy is the MLM cross-entropy that measures the likelihood
-        #    of predictions for masked words.
-        # 2. If `lm_labels` is provided we are in a causal scenario where we
-        #    try to predict the next token for each input in the decoder.
-        if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()  # -100 index = padding token
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-
-        if lm_labels is not None:
-            # we are doing next-token prediction; shift prediction scores and input ids by one
-            prediction_scores = prediction_scores[:, :-1, :].contiguous()
-            lm_labels = lm_labels[:, 1:].contiguous()
-            loss_fct = CrossEntropyLoss()
-            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
-            outputs = (ltr_lm_loss,) + outputs
-
-        return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
-)
-class BertForNextSentencePrediction(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.bert = BertModel(config)
-        self.cls = BertOnlyNSPHead(config)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        next_sentence_label=None,
-    ):
-        r"""
-        next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
-            Next sequence prediction (classification) loss.
-        seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForNextSentencePrediction
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        seq_relationship_scores = outputs[0]
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        seq_relationship_score = self.cls(pooled_output)
-
-        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss()
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
-            outputs = (next_sentence_loss,) + outputs
-
-        return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    BERT_START_DOCSTRING,
-)
-class BertForSequenceClassification(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING,
-)
-class BertForMultipleChoice(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, 1)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
-
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForMultipleChoice
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
-        """
-        num_choices = input_ids.shape[1]
-
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        reshaped_logits = logits.view(-1, num_choices)
-
-        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    BERT_START_DOCSTRING,
-)
-class BertForTokenClassification(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForTokenClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
-    BERT_START_DOCSTRING,
-)
-class BertForQuestionAnswering(BertPreTrainedModel):
-    def __init__(self, config):
-        super(BertForQuestionAnswering, self).__init__(config)
-        self.num_labels = config.num_labels
-
-        self.bert = BertModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForQuestionAnswering
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        assert answer == "a nice puppet"
-
-        """
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_camembert.py b/server/transformers/src/transformers/modeling_camembert.py
deleted file mode 100644
index 12877dff16fa22b32e6efa8f0870cc4abed93d54..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_camembert.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# coding=utf-8
-# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch CamemBERT model. """
-
-
-import logging
-
-from .configuration_camembert import CamembertConfig
-from .file_utils import add_start_docstrings
-from .modeling_roberta import (
-    RobertaForMaskedLM,
-    RobertaForMultipleChoice,
-    RobertaForSequenceClassification,
-    RobertaForTokenClassification,
-    RobertaModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
-    "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/pytorch_model.bin",
-    "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/pytorch_model.bin",
-}
-
-
-CAMEMBERT_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
-    CAMEMBERT_START_DOCSTRING,
-)
-class CamembertModel(RobertaModel):
-    """
-    This class overrides :class:`~transformers.RobertaModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
-)
-class CamembertForMaskedLM(RobertaForMaskedLM):
-    """
-    This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    CAMEMBERT_START_DOCSTRING,
-)
-class CamembertForSequenceClassification(RobertaForSequenceClassification):
-    """
-    This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    CAMEMBERT_START_DOCSTRING,
-)
-class CamembertForMultipleChoice(RobertaForMultipleChoice):
-    """
-    This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    CAMEMBERT_START_DOCSTRING,
-)
-class CamembertForTokenClassification(RobertaForTokenClassification):
-    """
-    This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/server/transformers/src/transformers/modeling_ctrl.py b/server/transformers/src/transformers/modeling_ctrl.py
deleted file mode 100644
index 40e076a4982ef388986b1f04aea8954f97624295..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_ctrl.py
+++ /dev/null
@@ -1,546 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch CTRL model."""
-
-
-import logging
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-
-from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import Conv1D, PreTrainedModel
-
-
-logger = logging.getLogger(__name__)
-
-CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/seqlen256_v1.bin"}
-
-
-def angle_defn(pos, i, d_model_size):
-    angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
-    return pos * angle_rates
-
-
-def positional_encoding(position, d_model_size, dtype):
-    # create the sinusoidal pattern for the positional encoding
-    angle_rads = angle_defn(
-        torch.arange(position, dtype=dtype).unsqueeze(1),
-        torch.arange(d_model_size, dtype=dtype).unsqueeze(0),
-        d_model_size,
-    )
-
-    sines = torch.sin(angle_rads[:, 0::2])
-    cosines = torch.cos(angle_rads[:, 1::2])
-
-    pos_encoding = torch.cat([sines, cosines], dim=-1)
-    return pos_encoding
-
-
-def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
-    # calculate attention
-    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
-
-    dk = k.shape[-1]
-    scaled_attention_logits = matmul_qk / np.sqrt(dk)
-
-    if mask is not None:
-        nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
-        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
-
-    if attention_mask is not None:
-        # Apply the attention mask
-        scaled_attention_logits = scaled_attention_logits + attention_mask
-
-    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
-
-    # Mask heads if we want to
-    if head_mask is not None:
-        attention_weights = attention_weights * head_mask
-
-    output = torch.matmul(attention_weights, v)
-
-    return output, attention_weights
-
-
-class MultiHeadAttention(torch.nn.Module):
-    def __init__(self, d_model_size, num_heads, output_attentions=False):
-        super().__init__()
-        self.output_attentions = output_attentions
-        self.num_heads = num_heads
-        self.d_model_size = d_model_size
-
-        self.depth = int(d_model_size / self.num_heads)
-
-        self.Wq = torch.nn.Linear(d_model_size, d_model_size)
-        self.Wk = torch.nn.Linear(d_model_size, d_model_size)
-        self.Wv = torch.nn.Linear(d_model_size, d_model_size)
-
-        self.dense = torch.nn.Linear(d_model_size, d_model_size)
-
-    def split_into_heads(self, x, batch_size):
-        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
-        return x.permute([0, 2, 1, 3])
-
-    def forward(self, v, k, q, mask, layer_past=None, attention_mask=None, head_mask=None):
-        batch_size = q.shape[0]
-
-        q = self.Wq(q)
-        k = self.Wk(k)
-        v = self.Wv(v)
-
-        q = self.split_into_heads(q, batch_size)
-        k = self.split_into_heads(k, batch_size)
-        v = self.split_into_heads(v, batch_size)
-        if layer_past is not None:
-            past_key, past_value = layer_past[0], layer_past[1]
-            k = torch.cat((past_key, k), dim=-2)
-            v = torch.cat((past_value, v), dim=-2)
-        present = torch.stack((k, v))
-
-        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
-        scaled_attention = output[0].permute([0, 2, 1, 3])
-        attn = output[1]
-        original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
-        output = self.dense(original_size_attention)
-
-        outputs = (output, present)
-        if self.output_attentions:
-            outputs = outputs + (attn,)
-        return outputs
-
-
-def point_wise_feed_forward_network(d_model_size, dff):
-    return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size))
-
-
-class EncoderLayer(torch.nn.Module):
-    def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False):
-        super().__init__()
-
-        self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions)
-        self.ffn = point_wise_feed_forward_network(d_model_size, dff)
-
-        self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=1e-6)
-        self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=1e-6)
-
-        self.dropout1 = torch.nn.Dropout(rate)
-        self.dropout2 = torch.nn.Dropout(rate)
-
-    def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None):
-        normed = self.layernorm1(x)
-        attn_outputs = self.multi_head_attention(
-            normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
-        )
-        attn_output = attn_outputs[0]
-        attn_output = self.dropout1(attn_output)
-        out1 = x + attn_output
-
-        out2 = self.layernorm2(out1)
-        ffn_output = self.ffn(out2)
-        ffn_output = self.dropout2(ffn_output)
-        out2 = out1 + ffn_output
-
-        outputs = (out2,) + attn_outputs[1:]
-        return outputs
-
-
-class CTRLPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = CTRLConfig
-    pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-
-CTRL_START_DOCSTRING = r"""
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-CTRL_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
-    CTRL_START_DOCSTRING,
-)
-class CTRLModel(CTRLPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.output_past = config.output_past
-
-        self.d_model_size = config.n_embd
-        self.num_layers = config.n_layer
-
-        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)
-
-        self.w = nn.Embedding(config.vocab_size, config.n_embd)
-
-        self.dropout = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList(
-            [
-                EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions)
-                for _ in range(config.n_layer)
-            ]
-        )
-        self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.w
-
-    def set_input_embeddings(self, new_embeddings):
-        self.w = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import CTRLTokenizer, CTRLModel
-        import torch
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            past_length = past[0][0].size(-2)
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # Attention mask.
-        if attention_mask is not None:
-            attention_mask = attention_mask.view(-1, input_shape[-1])
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-            token_type_embeds = self.w(token_type_ids)
-            token_type_embeds *= np.sqrt(self.d_model_size)
-        else:
-            token_type_embeds = 0
-        position_ids = position_ids.view(-1, input_shape[-1])
-
-        if inputs_embeds is None:
-            inputs_embeds = self.w(input_ids)
-        # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
-        seq_len = input_shape[-1]
-        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device)
-
-        inputs_embeds *= np.sqrt(self.d_model_size)
-
-        pos_embeds = self.pos_encoding[position_ids, :].to(inputs_embeds.device)
-
-        hidden_states = inputs_embeds + pos_embeds + token_type_embeds
-
-        hidden_states = self.dropout(hidden_states)
-
-        output_shape = input_shape + (inputs_embeds.size(-1),)
-        presents = ()
-        all_hidden_states = ()
-        all_attentions = []
-        for i, (h, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-            outputs = h(
-                hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
-            )
-            hidden_states, present = outputs[:2]
-            if self.output_past:
-                presents = presents + (present,)
-
-            if self.output_attentions:
-                all_attentions.append(outputs[2])
-
-        hidden_states = self.layernorm(hidden_states)
-        hidden_states = hidden_states.view(*output_shape)
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_past:
-            outputs = outputs + (presents,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
-            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs
-
-
-@add_start_docstrings(
-    """The CTRL Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    CTRL_START_DOCSTRING,
-)
-class CTRLLMHeadModel(CTRLPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = CTRLModel(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        # only last token for inputs_ids if past is defined in kwargs
-        if "past" in kwargs and kwargs["past"]:
-            input_ids = input_ids[:, -1].unsqueeze(-1)
-
-        inputs = {"input_ids": input_ids}
-        inputs.update(kwargs)
-        return inputs
-
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import torch
-        from transformers import CTRLTokenizer, CTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            past=past,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_distilbert.py b/server/transformers/src/transformers/modeling_distilbert.py
deleted file mode 100644
index be876f362f339f0a9b5ec4ff795d8196d8e53b9e..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_distilbert.py
+++ /dev/null
@@ -1,841 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch DistilBERT model
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
-    and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
-"""
-
-
-import copy
-import logging
-import math
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-
-from .configuration_distilbert import DistilBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import PreTrainedModel, prune_linear_layer, transpose_iterable
-
-
-logger = logging.getLogger(__name__)
-
-
-DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
-    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
-    "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
-    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
-    "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin",
-}
-
-
-# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
-def gelu(x):
-    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
-def create_sinusoidal_embeddings(n_pos, dim, out):
-    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
-    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
-    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
-    out.detach_()
-    out.requires_grad = False
-
-
-class Embeddings(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
-        if config.sinusoidal_pos_embds:
-            create_sinusoidal_embeddings(
-                n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
-            )
-
-        self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def forward(self, input_ids):
-        """
-        Parameters
-        ----------
-        input_ids: torch.tensor(bs, max_seq_length)
-            The token ids to embed.
-
-        Outputs
-        -------
-        embeddings: torch.tensor(bs, max_seq_length, dim)
-            The embedded tokens (plus position embeddings, no token_type embeddings)
-        """
-        seq_length = input_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
-        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
-
-        word_embeddings = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
-        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
-
-        embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)
-        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
-        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
-        return embeddings
-
-
-class MultiHeadSelfAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.n_heads = config.n_heads
-        self.dim = config.dim
-        self.dropout = nn.Dropout(p=config.attention_dropout)
-        self.output_attentions = config.output_attentions
-        self.output_additional_info = config.output_additional_info
-
-        assert self.dim % self.n_heads == 0
-
-        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
-        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
-        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
-        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
-
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        attention_head_size = self.dim // self.n_heads
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_heads, attention_head_size)
-        heads = set(heads) - self.pruned_heads
-        for head in heads:
-            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        # Prune linear layers
-        self.q_lin = prune_linear_layer(self.q_lin, index)
-        self.k_lin = prune_linear_layer(self.k_lin, index)
-        self.v_lin = prune_linear_layer(self.v_lin, index)
-        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
-        # Update hyper params
-        self.n_heads = self.n_heads - len(heads)
-        self.dim = attention_head_size * self.n_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def forward(self, query, key, value, mask, head_mask=None):
-        """
-        Parameters
-        ----------
-        query: torch.tensor(bs, seq_length, dim)
-        key: torch.tensor(bs, seq_length, dim)
-        value: torch.tensor(bs, seq_length, dim)
-        mask: torch.tensor(bs, seq_length)
-
-        Outputs
-        -------
-        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
-            Attention weights
-        context: torch.tensor(bs, seq_length, dim)
-            Contextualized layer. Optional: only if `output_attentions=True`
-        """
-        bs, q_length, dim = query.size()
-        k_length = key.size(1)
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        # assert key.size() == value.size()
-
-        dim_per_head = self.dim // self.n_heads
-
-        mask_reshp = (bs, 1, 1, k_length)
-
-        def shape(x):
-            """ separate heads """
-            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
-
-        def unshape(x):
-            """ group heads """
-            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
-
-        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
-        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
-        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
-
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
-        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
-        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
-        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)
-
-        weights = nn.Softmax(dim=-1)(scores)  # (bs, n_heads, q_length, k_length)
-        weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
-        new_context = unshape(context)  # (bs, q_length, dim)
-        new_context = self.out_lin(new_context)  # (bs, q_length, dim)
-
-        output = (new_context,)
-
-        if self.output_attentions:
-            output += (weights,)
-
-            if self.output_additional_info:
-                output += (context,)
-
-                return output
-
-
-class FFN(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.dropout = nn.Dropout(p=config.dropout)
-        self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
-        self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
-        assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
-            config.activation
-        )
-        self.activation = gelu if config.activation == "gelu" else nn.ReLU()
-
-    def forward(self, input):
-        x = self.lin1(input)
-        x = self.activation(x)
-        x = self.lin2(x)
-        x = self.dropout(x)
-        return x
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        self.n_heads = config.n_heads
-        self.dim = config.dim
-        self.hidden_dim = config.hidden_dim
-        self.dropout = nn.Dropout(p=config.dropout)
-        self.activation = config.activation
-        self.output_attentions = config.output_attentions
-        self.output_additional_info = config.output_additional_info
-
-        assert config.dim % config.n_heads == 0
-
-        self.attention = MultiHeadSelfAttention(config)
-        self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
-
-        self.ffn = FFN(config)
-        self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
-
-    def forward(self, x, attn_mask=None, head_mask=None):
-        """
-        Parameters
-        ----------
-        x: torch.tensor(bs, seq_length, dim)
-        attn_mask: torch.tensor(bs, seq_length)
-
-        Outputs
-        -------
-        sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
-            The attention weights
-        ffn_output: torch.tensor(bs, seq_length, dim)
-            The output of the transformer block contextualization.
-        """
-        # Self-Attention
-        sa_raw_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask)
-        assert type(sa_raw_output) == tuple, "Expected output to be a tuple"
-        sa_output = sa_raw_output[0]
-        if self.output_attentions:
-            sa_weights = sa_raw_output[1]# (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
-            if self.output_additional_info:
-                sa_additional_info = sa_raw_output[2]
-
-        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
-
-        # Feed Forward Network
-        ffn_output = self.ffn(sa_output)  # (bs, seq_length, dim)
-        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
-
-        output = (ffn_output,)
-        output = output + sa_raw_output[1:]
-        return output
-
-
-class Transformer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.n_layers = config.n_layers
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.output_additional_info = config.output_additional_info
-
-        layer = TransformerBlock(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
-
-    def forward(self, x, attn_mask=None, head_mask=None):
-        """
-        Parameters
-        ----------
-        x: torch.tensor(bs, seq_length, dim)
-            Input sequence embedded.
-        attn_mask: torch.tensor(bs, seq_length)
-            Attention mask on the sequence.
-
-        Outputs
-        -------
-        hidden_state: torch.tensor(bs, seq_length, dim)
-            Sequence of hiddens states in the last (top) layer
-        all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
-            Tuple of length n_layers with the hidden states from each layer.
-            Optional: only if output_hidden_states=True
-        all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
-            Tuple of length n_layers with the attention weights from each layer
-            Optional: only if output_attentions=True
-        """
-        all_hidden_states = ()
-        all_attentions = ()
-        all_additional_info = ()
-
-        hidden_state = x
-        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_state,)
-
-            layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i])
-            hidden_state = layer_outputs[0]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-                if self.output_additional_info:
-                    all_additional_info = all_additional_info + (layer_outputs[2],)
-
-        outputs = (hidden_state,)
-        if self.output_hidden_states:
-            # Add last layer
-            all_hidden_states = all_hidden_states + (hidden_state,)
-            outputs = outputs + (all_hidden_states,)
-            
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-            if self.output_additional_info:
-                outputs = outputs + (all_additional_info,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
-class DistilBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = DistilBertConfig
-    pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = None
-    base_model_prefix = "distilbert"
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, nn.Embedding):
-            if module.weight.requires_grad:
-                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-
-DISTILBERT_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-DISTILBERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.DistilBertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
-    DISTILBERT_START_DOCSTRING,
-)
-class DistilBertModel(DistilBertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.embeddings = Embeddings(config)  # Embeddings
-        self.transformer = Transformer(config)  # Encoder
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, new_embeddings):
-        self.embeddings.word_embeddings = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.transformer.layer[layer].attention.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertModel
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)  # (bs, seq_length)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)  # (bs, seq_length, dim)
-        tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask)
-        hidden_state = tfmr_output[0]
-        output = (hidden_state,) + tfmr_output[1:]
-
-        return output  # last-layer hidden-state, (all hidden_states), (all attentions), (all additional info)
-
-
-@add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
-)
-class DistilBertForMaskedLM(DistilBertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        self.distilbert = DistilBertModel(config)
-        self.vocab_transform = nn.Linear(config.dim, config.dim)
-        self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
-        self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
-
-        self.init_weights()
-
-        self.mlm_loss_fct = nn.CrossEntropyLoss()
-
-    def get_output_embeddings(self):
-        return self.vocab_projector
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
-        r"""
-        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
-        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForMaskedLM
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
-        """
-        dlbrt_output = self.distilbert(
-            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
-        )
-        hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
-        prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)
-
-        outputs = (prediction_logits,) + dlbrt_output[1:]
-        if masked_lm_labels is not None:
-            mlm_loss = self.mlm_loss_fct(
-                prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
-            )
-            outputs = (mlm_loss,) + outputs
-
-        return outputs  # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    DISTILBERT_START_DOCSTRING,
-)
-class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.distilbert = DistilBertModel(config)
-        self.pre_classifier = nn.Linear(config.dim, config.dim)
-        self.classifier = nn.Linear(config.dim, config.num_labels)
-        self.dropout = nn.Dropout(config.seq_classif_dropout)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
-        """
-        distilbert_output = self.distilbert(
-            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
-        )
-        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
-        pooled_output = hidden_state[:, 0]  # (bs, dim)
-        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
-        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
-        pooled_output = self.dropout(pooled_output)  # (bs, dim)
-        logits = self.classifier(pooled_output)  # (bs, dim)
-
-        outputs = (logits,) + distilbert_output[1:]
-        if labels is not None:
-            if self.num_labels == 1:
-                loss_fct = nn.MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = nn.CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    DISTILBERT_START_DOCSTRING,
-)
-class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.distilbert = DistilBertModel(config)
-        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
-        assert config.num_labels == 2
-        self.dropout = nn.Dropout(config.qa_dropout)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:3]
-
-        """
-        distilbert_output = self.distilbert(
-            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
-        )
-        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
-
-        hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)
-        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)  # (bs, max_query_len)
-        end_logits = end_logits.squeeze(-1)  # (bs, max_query_len)
-
-        outputs = (start_logits, end_logits,) + distilbert_output[1:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    DISTILBERT_START_DOCSTRING,
-)
-class DistilBertForTokenClassification(DistilBertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.distilbert = DistilBertModel(config)
-        self.dropout = nn.Dropout(config.dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForTokenClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
-        """
-
-        outputs = self.distilbert(
-            input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), scores, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_encoder_decoder.py b/server/transformers/src/transformers/modeling_encoder_decoder.py
deleted file mode 100644
index 0951baff7d4c3b207013850437b815cfddba7c9e..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_encoder_decoder.py
+++ /dev/null
@@ -1,350 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Classes to support Encoder-Decoder architectures """
-
-
-import logging
-import os
-
-import torch
-from torch import nn
-
-from .modeling_auto import AutoModel, AutoModelWithLMHead
-
-
-logger = logging.getLogger(__name__)
-
-
-class PreTrainedEncoderDecoder(nn.Module):
-    r"""
-        :class:`~transformers.PreTrainedEncoderDecoder` is a generic model class that will be
-        instantiated as a transformer architecture with one of the base model
-        classes of the library as encoder and (optionally) another one as
-        decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-    """
-
-    def __init__(self, encoder, decoder):
-        super().__init__()
-        self.encoder = encoder
-        self.decoder = decoder
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        encoder_pretrained_model_name_or_path=None,
-        decoder_pretrained_model_name_or_path=None,
-        *model_args,
-        **kwargs
-    ):
-        r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
-
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you need to first set it back in training mode with `model.train()`
-
-        Params:
-            encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments.
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-                You can specify kwargs sepcific for the encoder and decoder by prefixing the key with `encoder_` and `decoder_` respectively. (e.g. ``decoder_output_attention=True``). The remaining kwargs will be passed to both encoders and decoders.
-
-        Examples::
-
-            # For example purposes. Not runnable.
-            model = PreTrainedEncoderDecoder.from_pretained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
-        """
-
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as a whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_common = {
-            argument: value
-            for argument, value in kwargs.items()
-            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
-        }
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_encoder.update(
-            {
-                argument[len("encoder_") :]: value
-                for argument, value in kwargs.items()
-                if argument.startswith("encoder_")
-            }
-        )
-        kwargs_decoder.update(
-            {
-                argument[len("decoder_") :]: value
-                for argument, value in kwargs.items()
-                if argument.startswith("decoder_")
-            }
-        )
-
-        # Load and initialize the encoder and decoder
-        # The distinction between encoder and decoder at the model level is made
-        # by the value of the flag `is_decoder` that we need to set correctly.
-        encoder = kwargs_encoder.pop("model", None)
-        if encoder is None:
-            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
-        encoder.config.is_decoder = False
-
-        decoder = kwargs_decoder.pop("model", None)
-        if decoder is None:
-            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
-        decoder.config.is_decoder = True
-
-        model = cls(encoder, decoder)
-
-        return model
-
-    def save_pretrained(self, save_directory):
-        """ Save a Seq2Seq model and its configuration file in a format such
-        that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained`
-
-        We save the encoder' and decoder's parameters in two separate directories.
-        """
-
-        # If the root output directory does not exist, create it
-        if not os.path.exists(save_directory):
-            os.mkdir(save_directory)
-
-        # Check whether the output directory is empty or not
-        sub_directories = [
-            directory
-            for directory in os.listdir(save_directory)
-            if os.path.isdir(os.path.join(save_directory, directory))
-        ]
-
-        if len(sub_directories) > 0:
-            if "encoder" in sub_directories and "decoder" in sub_directories:
-                print(
-                    "WARNING: there is an older version of encoder-decoder saved in"
-                    + " the output directory. The default behaviour is to overwrite them."
-                )
-
-            # Empty the output directory
-            for directory_to_remove in sub_directories:
-                # Remove all files into the subdirectory
-                files_to_remove = os.listdir(os.path.join(save_directory, directory_to_remove))
-                for file_to_remove in files_to_remove:
-                    os.remove(os.path.join(save_directory, directory_to_remove, file_to_remove))
-                # Remove the subdirectory itself
-                os.rmdir(os.path.join(save_directory, directory_to_remove))
-
-            assert len(os.listdir(save_directory)) == 0  # sanity check
-
-        # Create the "encoder" directory inside the output directory and save the encoder into it
-        if not os.path.exists(os.path.join(save_directory, "encoder")):
-            os.mkdir(os.path.join(save_directory, "encoder"))
-        self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
-
-        # Create the "encoder" directory inside the output directory and save the decoder into it
-        if not os.path.exists(os.path.join(save_directory, "decoder")):
-            os.mkdir(os.path.join(save_directory, "decoder"))
-        self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
-
-    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
-        """ The forward pass on a seq2eq depends what we are performing:
-
-        - During training we perform one forward pass through both the encoder
-          and decoder;
-        - During prediction, we perform one forward pass through the encoder,
-          and then perform several forward passes with the encoder's hidden
-          state through the decoder to decode a full sequence.
-
-        Therefore, we skip the forward pass on the encoder if an argument named
-        `encoder_hidden_state` is passed to this function.
-
-        Params:
-            encoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
-                Indices of encoder input sequence tokens in the vocabulary.
-            decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
-                Indices of decoder input sequence tokens in the vocabulary.
-            kwargs: (`optional`) Remaining dictionary of keyword arguments.
-        """
-        kwargs_encoder, kwargs_decoder = self.prepare_model_kwargs(**kwargs)
-
-        # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
-        if encoder_hidden_states is None:
-            encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
-        else:
-            encoder_outputs = ()
-
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        decoder_outputs = self.decoder(decoder_input_ids, encoder_hidden_states, **kwargs_decoder)
-
-        return decoder_outputs + encoder_outputs
-
-    @staticmethod
-    def prepare_model_kwargs(**kwargs):
-        """ Prepare the encoder and decoder's keyword arguments.
-
-        Keyword arguments come in 3 flavors:
-        - encoder-specific (prefixed by `encoder_`)
-        - decoder-specific (prefixed by `decoder_`)
-        - those that apply to the model as whole.
-
-        We let the specific kwargs override the common ones in case of
-        conflict.
-        """
-        kwargs_common = {
-            argument: value
-            for argument, value in kwargs.items()
-            if not argument.startswith("encoder_") and not argument.startswith("decoder_")
-        }
-        decoder_kwargs = kwargs_common.copy()
-        encoder_kwargs = kwargs_common.copy()
-        encoder_kwargs.update(
-            {
-                argument[len("encoder_") :]: value
-                for argument, value in kwargs.items()
-                if argument.startswith("encoder_")
-            }
-        )
-        decoder_kwargs.update(
-            {
-                argument[len("decoder_") :]: value
-                for argument, value in kwargs.items()
-                if argument.startswith("decoder_")
-            }
-        )
-        decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None)
-        return encoder_kwargs, decoder_kwargs
-
-
-class Model2Model(PreTrainedEncoderDecoder):
-    r"""
-        :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model
-        where both of the encoder and decoder are of the same family. If the
-        name of or that path to a pretrained model is specified the encoder and
-        the decoder will be initialized with the pretrained weight (the
-        cross-attention will be intialized randomly if its weights are not
-        present).
-
-        It is possible to override this behavior and initialize, say, the decoder randomly
-        by creating it beforehand as follows
-
-            config = BertConfig.from_pretrained()
-            decoder = BertForMaskedLM(config)
-            model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Tying the encoder and decoders' embeddings together.
-
-       We need for each to get down to the embedding weights. However the
-        different model classes are inconsistent to that respect:
-        - BertModel: embeddings.word_embeddings
-        - RoBERTa: embeddings.word_embeddings
-        - XLMModel: embeddings
-        - GPT2: wte
-        - BertForMaskedLM: bert.embeddings.word_embeddings
-        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
-
-        argument of the XEmbedding layer for each model, but it is "blocked"
-        by a model-specific keyword (bert, )...
-        """
-        # self._tie_or_clone_weights(self.encoder, self.decoder)
-        pass
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-
-        if (
-            "bert" not in pretrained_model_name_or_path
-            or "roberta" in pretrained_model_name_or_path
-            or "distilbert" in pretrained_model_name_or_path
-        ):
-            raise ValueError("Only the Bert model is currently supported.")
-
-        model = super().from_pretrained(
-            encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            *args,
-            **kwargs,
-        )
-
-        return model
-
-
-class Model2LSTM(PreTrainedEncoderDecoder):
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        if kwargs.get("decoder_model", None) is None:
-            # We will create a randomly initilized LSTM model as decoder
-            if "decoder_config" not in kwargs:
-                raise ValueError(
-                    "To load an LSTM in Encoder-Decoder model, please supply either: "
-                    "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or"
-                    "    - a dictionary of configuration parameters that will be used to initialize a"
-                    "      torch.nn.LSTM model as `decoder_config` keyword argument. "
-                    "      E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`"
-                )
-            kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config"))
-        model = super().from_pretrained(*args, **kwargs)
-        return model
diff --git a/server/transformers/src/transformers/modeling_flaubert.py b/server/transformers/src/transformers/modeling_flaubert.py
deleted file mode 100644
index 6ec64ba8cc32990b63eceff4dd551fe261a83d63..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_flaubert.py
+++ /dev/null
@@ -1,385 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Flaubert model, based on XLM. """
-
-
-import logging
-import random
-
-import torch
-from torch.nn import functional as F
-
-from .configuration_flaubert import FlaubertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_xlm import (
-    XLMForQuestionAnswering,
-    XLMForQuestionAnsweringSimple,
-    XLMForSequenceClassification,
-    XLMModel,
-    XLMWithLMHeadModel,
-    get_masks,
-)
-
-
-logger = logging.getLogger(__name__)
-
-FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/pytorch_model.bin",
-    "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/pytorch_model.bin",
-    "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/pytorch_model.bin",
-    "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/pytorch_model.bin",
-}
-
-
-FLAUBERT_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-FLAUBERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Length of each sentence that can be used to avoid performing attention on padding token indices.
-            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
-            Indices selected in ``[0, ..., input_ids.size(-1)]``:
-        cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`):
-            dictionary with ``torch.FloatTensor`` that contains pre-computed
-            hidden-states (key and values in the attention blocks) as computed by the model
-            (see `cache` output below). Can be used to speed up sequential decoding.
-            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
-    FLAUBERT_START_DOCSTRING,
-)
-class FlaubertModel(XLMModel):
-
-    config_class = FlaubertConfig
-    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-    def __init__(self, config):  # , dico, is_encoder, with_output):
-        super(FlaubertModel, self).__init__(config)
-        self.layerdrop = getattr(config, "layerdrop", 0.0)
-        self.pre_norm = getattr(config, "pre_norm", False)
-
-    @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
-        model = FlaubertModel.from_pretrained('flaubert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        # removed: src_enc=None, src_len=None
-        if input_ids is not None:
-            bs, slen = input_ids.size()
-        else:
-            bs, slen = inputs_embeds.size()[:-1]
-
-        if lengths is None:
-            if input_ids is not None:
-                lengths = (input_ids != self.pad_index).sum(dim=1).long()
-            else:
-                lengths = torch.LongTensor([slen] * bs)
-        # mask = input_ids != self.pad_index
-
-        # check inputs
-        assert lengths.size(0) == bs
-        assert lengths.max().item() <= slen
-        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
-        # assert (src_enc is None) == (src_len is None)
-        # if src_enc is not None:
-        #     assert self.is_decoder
-        #     assert src_enc.size(0) == bs
-
-        # generate masks
-        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
-        # if self.is_decoder and src_enc is not None:
-        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        # position_ids
-        if position_ids is None:
-            position_ids = torch.arange(slen, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).expand((bs, slen))
-        else:
-            assert position_ids.size() == (bs, slen)  # (slen, bs)
-            # position_ids = position_ids.transpose(0, 1)
-
-        # langs
-        if langs is not None:
-            assert langs.size() == (bs, slen)  # (slen, bs)
-            # langs = langs.transpose(0, 1)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.n_layers
-
-        # do not recompute cached elements
-        if cache is not None and input_ids is not None:
-            _slen = slen - cache["slen"]
-            input_ids = input_ids[:, -_slen:]
-            position_ids = position_ids[:, -_slen:]
-            if langs is not None:
-                langs = langs[:, -_slen:]
-            mask = mask[:, -_slen:]
-            attn_mask = attn_mask[:, -_slen:]
-
-        # embeddings
-        if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)
-
-        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
-        if langs is not None and self.use_lang_emb:
-            tensor = tensor + self.lang_embeddings(langs)
-        if token_type_ids is not None:
-            tensor = tensor + self.embeddings(token_type_ids)
-        tensor = self.layer_norm_emb(tensor)
-        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
-        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
-
-        # transformer layers
-        hidden_states = ()
-        attentions = ()
-        for i in range(self.n_layers):
-            # LayerDrop
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
-
-            if self.output_hidden_states:
-                hidden_states = hidden_states + (tensor,)
-
-            # self attention
-            if not self.pre_norm:
-                attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
-                attn = attn_outputs[0]
-                if self.output_attentions:
-                    attentions = attentions + (attn_outputs[1],)
-                attn = F.dropout(attn, p=self.dropout, training=self.training)
-                tensor = tensor + attn
-                tensor = self.layer_norm1[i](tensor)
-            else:
-                tensor_normalized = self.layer_norm1[i](tensor)
-                attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i])
-                attn = attn_outputs[0]
-                if self.output_attentions:
-                    attentions = attentions + (attn_outputs[1],)
-                attn = F.dropout(attn, p=self.dropout, training=self.training)
-                tensor = tensor + attn
-
-            # encoder attention (for decoder only)
-            # if self.is_decoder and src_enc is not None:
-            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
-            #     tensor = tensor + attn
-            #     tensor = self.layer_norm15[i](tensor)
-
-            # FFN
-            if not self.pre_norm:
-                tensor = tensor + self.ffns[i](tensor)
-                tensor = self.layer_norm2[i](tensor)
-            else:
-                tensor_normalized = self.layer_norm2[i](tensor)
-                tensor = tensor + self.ffns[i](tensor_normalized)
-
-            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
-
-        # Add last hidden state
-        if self.output_hidden_states:
-            hidden_states = hidden_states + (tensor,)
-
-        # update cache length
-        if cache is not None:
-            cache["slen"] += tensor.size(1)
-
-        # move back sequence length to dimension 0
-        # tensor = tensor.transpose(0, 1)
-
-        outputs = (tensor,)
-        if self.output_hidden_states:
-            outputs = outputs + (hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (attentions,)
-        return outputs  # outputs, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """The Flaubert Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    FLAUBERT_START_DOCSTRING,
-)
-class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
-    """
-    This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = FlaubertConfig
-    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-    def __init__(self, config):
-        super(FlaubertWithLMHeadModel, self).__init__(config)
-        self.transformer = FlaubertModel(config)
-        self.init_weights()
-
-
-@add_start_docstrings(
-    """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    FLAUBERT_START_DOCSTRING,
-)
-class FlaubertForSequenceClassification(XLMForSequenceClassification):
-    """
-    This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = FlaubertConfig
-    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-    def __init__(self, config):
-        super(FlaubertForSequenceClassification, self).__init__(config)
-        self.transformer = FlaubertModel(config)
-        self.init_weights()
-
-
-@add_start_docstrings(
-    """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    FLAUBERT_START_DOCSTRING,
-)
-class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
-    """
-    This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = FlaubertConfig
-    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-    def __init__(self, config):
-        super(FlaubertForQuestionAnsweringSimple, self).__init__(config)
-        self.transformer = FlaubertModel(config)
-        self.init_weights()
-
-
-@add_start_docstrings(
-    """Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    FLAUBERT_START_DOCSTRING,
-)
-class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
-    """
-    This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = FlaubertConfig
-    pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-    def __init__(self, config):
-        super(FlaubertForQuestionAnswering, self).__init__(config)
-        self.transformer = FlaubertModel(config)
-        self.init_weights()
diff --git a/server/transformers/src/transformers/modeling_gpt2.py b/server/transformers/src/transformers/modeling_gpt2.py
deleted file mode 100644
index 77027acd53b63a9c688a9ecdc00e54f5c3d737b5..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_gpt2.py
+++ /dev/null
@@ -1,757 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch OpenAI GPT-2 model."""
-
-
-import logging
-import math
-import os
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-
-from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer, transpose_iterable
-
-
-logger = logging.getLogger(__name__)
-
-GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
-    "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
-    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",
-}
-
-
-def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    try:
-        import re
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(gpt2_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array.squeeze())
-
-    for name, array in zip(names, arrays):
-        name = name[6:]  # skip "model/"
-        name = name.split("/")
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
-                scope_names = re.split(r"(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "w" or scope_names[0] == "g":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "b":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
-                pointer = getattr(pointer, scope_names[0])
-                pointer = getattr(pointer, "weight")
-            else:
-                pointer = getattr(pointer, scope_names[0])
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False):
-        super().__init__()
-        self.output_attentions = config.output_attentions
-        self.output_additional_info = config.output_additional_info
-
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.c_attn = Conv1D(n_state * 3, nx)
-        self.c_proj = Conv1D(n_state, nx)
-        self.attn_dropout = nn.Dropout(config.attn_pdrop)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_head, self.split_size // self.n_head)
-        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
-        for head in heads:
-            # Compute how many pruned heads are before the head and move the index accordingly
-            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
-
-        # Prune conv1d layers
-        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
-        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
-
-        # Update hyper params
-        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
-        self.n_head = self.n_head - len(heads)
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
-        w = torch.matmul(q, k)
-        if self.scale:
-            w = w / math.sqrt(v.size(-1))
-        nd, ns = w.size(-2), w.size(-1)
-        b = self.bias[:, :, ns - nd : ns, :ns]
-        w = w * b - 1e4 * (1 - b)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            w = w + attention_mask
-
-        w = nn.Softmax(dim=-1)(w)
-        w = self.attn_dropout(w)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        contexts = torch.matmul(w, v)
-        outputs = [contexts]
-        if self.output_attentions:
-            outputs.append(w)
-
-            if self.output_additional_info:
-                contexts = contexts.permute(0, 2, 1, 3).contiguous()
-                print("CONTEXTS: ", contexts.shape)
-                outputs.append(contexts)
-
-        return outputs
-
-    def merge_heads(self, x):
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
-        else:
-            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-
-    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
-        x = self.c_attn(x)
-        query, key, value = x.split(self.split_size, dim=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
-        value = self.split_heads(value)
-        if layer_past is not None:
-            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
-            key = torch.cat((past_key, key), dim=-1)
-            value = torch.cat((past_value, value), dim=-2)
-        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
-
-        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
-        a = attn_outputs[0]
-
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a)
-
-        outputs = [a, present] + attn_outputs[1:]
-        return outputs  # a, present, (attentions), (contexts)
-
-
-class MLP(nn.Module):
-    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super().__init__()
-        nx = config.n_embd
-        self.c_fc = Conv1D(n_state, nx)
-        self.c_proj = Conv1D(nx, n_state)
-        self.act = gelu
-        self.dropout = nn.Dropout(config.resid_pdrop)
-
-    def forward(self, x):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        return self.dropout(h2)
-
-
-class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False):
-        super().__init__()
-        nx = config.n_embd
-        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.attn = Attention(nx, n_ctx, config, scale)
-        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.mlp = MLP(4 * nx, config)
-
-    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
-        output_attn = self.attn(
-            self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
-        )
-        a = output_attn[0]  # output_attn: a, present, (attentions)
-
-        x = x + a
-        m = self.mlp(self.ln_2(x))
-        x = x + m
-
-        outputs = [x] + output_attn[1:]
-        return outputs  # x, present, (attentions), (?contexts)
-
-
-class GPT2PreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = GPT2Config
-    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_gpt2
-    base_model_prefix = "transformer"
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-
-GPT2_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-GPT2_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
-    GPT2_START_DOCSTRING,
-)
-class GPT2Model(GPT2PreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.output_additional_info = config.output_additional_info
-        self.output_past = config.output_past
-
-        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
-        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def set_input_embeddings(self, new_embeddings):
-        self.wte = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import GPT2Tokenizer, GPT2Model
-        import torch
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            past_length = past[0][0].size(-2)
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # Attention mask.
-        if attention_mask is not None:
-            attention_mask = attention_mask.view(-1, input_shape[-1])
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        if token_type_ids is not None:
-            token_type_embeds = self.wte(token_type_ids)
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        presents = ()
-        all_attentions = []
-        all_hidden_states = ()
-        all_additional_info = ()
-        for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-
-            outputs = block(
-                hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
-            )
-
-            hidden_states, present = outputs[:2]
-            if self.output_past:
-                presents = presents + (present,)
-
-            if self.output_attentions:
-                all_attentions.append(outputs[2])
-                if self.output_additional_info:
-                    all_additional_info = all_additional_info + (outputs[3],)
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = hidden_states.view(*output_shape)
-        # Add last hidden state
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_past:
-            outputs = outputs + (presents,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
-            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-            if self.output_additional_info:
-                outputs = outputs + (all_additional_info,)
-
-        return outputs  # last hidden state, (presents), (all hidden_states), (attentions), (contexts)
-
-
-@add_start_docstrings(
-    """The GPT2 Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    GPT2_START_DOCSTRING,
-)
-class GPT2LMHeadModel(GPT2PreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = GPT2Model(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        # only last token for inputs_ids if past is defined in kwargs
-        if "past" in kwargs and kwargs["past"]:
-            input_ids = input_ids[:, -1].unsqueeze(-1)
-
-        inputs = {"input_ids": input_ids}
-        inputs.update(kwargs)
-        return inputs
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import torch
-        from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            past=past,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
-    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
-    The language modeling head has its weights tied to the input embeddings,
-    the classification head takes as input the input of a specified classification token index in the input sequence).
-""",
-    GPT2_START_DOCSTRING,
-)
-class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        config.num_labels = 1
-        self.transformer = GPT2Model(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.multiple_choice_head = SequenceSummary(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        mc_token_ids=None,
-        lm_labels=None,
-        mc_labels=None,
-    ):
-        r"""
-        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
-            Language modeling loss.
-        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided):
-            Multiple choice classification loss.
-        lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import torch
-        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
-
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
-
-        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
-        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
-
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            past=past,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
-
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
-            outputs = (loss,) + outputs
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_mmbt.py b/server/transformers/src/transformers/modeling_mmbt.py
deleted file mode 100644
index a3aae3896585a454473cd34735e0606c155ce075..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_mmbt.py
+++ /dev/null
@@ -1,419 +0,0 @@
-# coding=utf-8
-# Copyright (c) Facebook, Inc. and its affiliates.
-# Copyright (c) HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch MMBT model. """
-
-
-import logging
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from .file_utils import add_start_docstrings
-
-
-logger = logging.getLogger(__name__)
-
-
-class ModalEmbeddings(nn.Module):
-    """Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
-    """
-
-    def __init__(self, config, encoder, embeddings):
-        super().__init__()
-        self.config = config
-        self.encoder = encoder
-        self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
-        self.position_embeddings = embeddings.position_embeddings
-        self.token_type_embeddings = embeddings.token_type_embeddings
-        self.word_embeddings = embeddings.word_embeddings
-        self.LayerNorm = embeddings.LayerNorm
-        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
-
-    def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
-        token_embeddings = self.proj_embeddings(self.encoder(input_modal))
-        seq_length = token_embeddings.size(1)
-
-        if start_token is not None:
-            start_token_embeds = self.word_embeddings(start_token)
-            seq_length += 1
-            token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
-
-        if end_token is not None:
-            end_token_embeds = self.word_embeddings(end_token)
-            seq_length += 1
-            token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
-
-        if position_ids is None:
-            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
-            position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
-
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(
-                (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
-            )
-
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = token_embeddings + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-
-MMBT_START_DOCSTRING = r"""    MMBT model was proposed in
-    `Supervised Multimodal Bitransformers for Classifying Images and Text`_
-    by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-    It's a supervised multimodal bitransformer model that fuses information from text and other image encoders,
-    and obtain state-of-the-art performance on various multimodal classification benchmark tasks.
-
-    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
-    refer to the PyTorch documentation for all matter related to general usage and behavior.
-
-    .. _`Supervised Multimodal Bitransformers for Classifying Images and Text`:
-        https://github.com/facebookresearch/mmbt
-
-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
-
-    Parameters:
-        config (:class:`~transformers.MMBTConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-        transformer (:class: `~nn.Module`): A text transformer that is used by MMBT.
-            It should have embeddings, encoder, and pooler attributes.
-        encoder (:class: `~nn.Module`): Encoder for the second modality.
-            It should take in a batch of modal inputs and return k, n dimension embeddings.
-"""
-
-MMBT_INPUTS_DOCSTRING = r"""    Inputs:
-        **input_modal**: ``torch.FloatTensor`` of shape ``(batch_size, ***)``:
-            The other modality data. It will be the shape that the encoder for that type expects.
-            e.g. With an Image Encoder, the shape would be (batch_size, channels, height, width)
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            It does not expect [CLS] token to be added as it's appended to the end of other modality embeddings.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **modal_start_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for Classification tasks.
-        **modal_end_tokens**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Segment token indices to indicate different portions of the inputs.
-        **modal_token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``:
-            Segment token indices to indicate different portions of the non-text modality.
-            The embeddings from these tokens will be summed with the respective token embeddings for the non-text modality.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-        **modal_position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, modal_sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
-            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
-            is configured as a decoder.
-        **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-"""
-
-
-@add_start_docstrings(
-    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
-    MMBT_START_DOCSTRING,
-    MMBT_INPUTS_DOCSTRING,
-)
-class MMBTModel(nn.Module):
-    r"""
-        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-            **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-                Sequence of hidden-states at the output of the last layer of the model.
-            **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
-                Last layer hidden-state of the first token of the sequence (classification token)
-                further processed by a Linear layer and a Tanh activation function. The Linear
-                layer weights are trained from the next sentence prediction (classification)
-                objective during Bert pretraining. This output is usually *not* a good summary
-                of the semantic content of the input, you're often better with averaging or pooling
-                the sequence of hidden-states for the whole input sequence.
-            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-                of shape ``(batch_size, sequence_length, hidden_size)``:
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-        Examples::
-
-            # For example purposes. Not runnable.
-            transformer = BertModel.from_pretrained('bert-base-uncased')
-            encoder = ImageEncoder(args)
-            mmbt = MMBTModel(config, transformer, encoder)
-        """
-
-    def __init__(self, config, transformer, encoder):
-        super().__init__()
-        self.config = config
-        self.transformer = transformer
-        self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
-
-    def forward(
-        self,
-        input_modal,
-        input_ids=None,
-        modal_start_tokens=None,
-        modal_end_tokens=None,
-        attention_mask=None,
-        token_type_ids=None,
-        modal_token_type_ids=None,
-        position_ids=None,
-        modal_position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-    ):
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_txt_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_txt_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        modal_embeddings = self.modal_encoder(
-            input_modal,
-            start_token=modal_start_tokens,
-            end_token=modal_end_tokens,
-            position_ids=modal_position_ids,
-            token_type_ids=modal_token_type_ids,
-        )
-
-        input_modal_shape = modal_embeddings.size()[:-1]
-
-        if token_type_ids is None:
-            token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
-
-        txt_embeddings = self.transformer.embeddings(
-            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
-        )
-
-        embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
-
-        input_shape = embedding_output.size()[:-1]
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)
-        else:
-            attention_mask = torch.cat(
-                [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
-            )
-
-        if encoder_attention_mask is None:
-            encoder_attention_mask = torch.ones(input_shape, device=device)
-        else:
-            encoder_attention_mask = torch.cat(
-                [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
-            )
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        if attention_mask.dim() == 3:
-            extended_attention_mask = attention_mask[:, None, :, :]
-
-        # Provided a padding mask of dimensions [batch_size, seq_length]
-        # - if the model is a decoder, apply a causal mask in addition to the padding mask
-        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if attention_mask.dim() == 2:
-            if self.config.is_decoder:
-                batch_size, seq_length = input_shape
-                seq_ids = torch.arange(seq_length, device=device)
-                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
-                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
-            else:
-                extended_attention_mask = attention_mask[:, None, None, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if encoder_attention_mask.dim() == 3:
-            encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-        if encoder_attention_mask.dim() == 2:
-            encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
-
-        encoder_extended_attention_mask = encoder_extended_attention_mask.to(
-            dtype=next(self.parameters()).dtype
-        )  # fp16 compatibility
-        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        encoder_outputs = self.transformer.encoder(
-            embedding_output,
-            attention_mask=extended_attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_extended_attention_mask,
-        )
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.transformer.pooler(sequence_output)
-
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-
-
-@add_start_docstrings(
-    """MMBT Model with a sequence classification/regression head on top (a linear layer on top of
-                      the pooled output)""",
-    MMBT_START_DOCSTRING,
-    MMBT_INPUTS_DOCSTRING,
-)
-class MMBTForClassification(nn.Module):
-    r"""
-            **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-                Labels for computing the sequence classification/regression loss.
-                Indices should be in ``[0, ..., config.num_labels - 1]``.
-                If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-                If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
-        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-            **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-                Classification (or regression if config.num_labels==1) loss.
-            **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-                Classification (or regression if config.num_labels==1) scores (before SoftMax).
-            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-                of shape ``(batch_size, sequence_length, hidden_size)``:
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-        Examples::
-
-            # For example purposes. Not runnable.
-            transformer = BertModel.from_pretrained('bert-base-uncased')
-            encoder = ImageEncoder(args)
-            model = MMBTForClassification(config, transformer, encoder)
-            outputs = model(input_modal, input_ids, labels=labels)
-            loss, logits = outputs[:2]
-        """
-
-    def __init__(self, config, transformer, encoder):
-        super().__init__()
-        self.num_labels = config.num_labels
-
-        self.mmbt = MMBTModel(config, transformer, encoder)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-    def forward(
-        self,
-        input_modal,
-        input_ids=None,
-        modal_start_tokens=None,
-        modal_end_tokens=None,
-        attention_mask=None,
-        token_type_ids=None,
-        modal_token_type_ids=None,
-        position_ids=None,
-        modal_position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-
-        outputs = self.mmbt(
-            input_modal=input_modal,
-            input_ids=input_ids,
-            modal_start_tokens=modal_start_tokens,
-            modal_end_tokens=modal_end_tokens,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            modal_token_type_ids=modal_token_type_ids,
-            position_ids=position_ids,
-            modal_position_ids=modal_position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_openai.py b/server/transformers/src/transformers/modeling_openai.py
deleted file mode 100644
index 70abd5a1dc5fd060066fd34f02dbfbe20e343434..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_openai.py
+++ /dev/null
@@ -1,700 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch OpenAI GPT model."""
-
-
-import json
-import logging
-import math
-import os
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-
-from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
-
-
-logger = logging.getLogger(__name__)
-
-OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"
-}
-
-
-def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
-    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
-    """
-    import re
-    import numpy as np
-
-    if ".ckpt" in openai_checkpoint_folder_path:
-        openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
-
-    logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
-
-    with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
-        names = json.load(names_handle)
-    with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
-        shapes = json.load(shapes_handle)
-    offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
-    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
-    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-
-    # This was used when we had a single embedding matrix for positions and tokens
-    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
-    # del init_params[1]
-    init_params = [arr.squeeze() for arr in init_params]
-
-    try:
-        assert model.tokens_embed.weight.shape == init_params[1].shape
-        assert model.positions_embed.weight.shape == init_params[0].shape
-    except AssertionError as e:
-        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
-        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
-        raise
-
-    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
-    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
-    names.pop(0)
-    # Pop position and token embedding arrays
-    init_params.pop(0)
-    init_params.pop(0)
-
-    for name, array in zip(names, init_params):  # names[1:n_transfer], init_params[1:n_transfer]):
-        name = name[6:]  # skip "model/"
-        assert name[-2:] == ":0"
-        name = name[:-2]
-        name = name.split("/")
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
-                scope_names = re.split(r"(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "g":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "b":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "w":
-                pointer = getattr(pointer, "weight")
-            else:
-                pointer = getattr(pointer, scope_names[0])
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
-
-
-class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False):
-        super().__init__()
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.output_attentions = config.output_attentions
-
-        self.c_attn = Conv1D(n_state * 3, nx)
-        self.c_proj = Conv1D(n_state, nx)
-        self.attn_dropout = nn.Dropout(config.attn_pdrop)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_head, self.split_size // self.n_head)
-        heads = set(heads) - self.pruned_heads
-        for head in heads:
-            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
-        # Prune conv1d layers
-        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
-        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
-        # Update hyper params
-        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
-        self.n_head = self.n_head - len(heads)
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
-        w = torch.matmul(q, k)
-        if self.scale:
-            w = w / math.sqrt(v.size(-1))
-        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
-        # XD: self.b may be larger than w, so we need to crop it
-        b = self.bias[:, :, : w.size(-2), : w.size(-1)]
-        w = w * b + -1e4 * (1 - b)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            w = w + attention_mask
-
-        w = nn.Softmax(dim=-1)(w)
-        w = self.attn_dropout(w)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        outputs = [torch.matmul(w, v)]
-        if self.output_attentions:
-            outputs.append(w)
-        return outputs
-
-    def merge_heads(self, x):
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)
-        else:
-            return x.permute(0, 2, 1, 3)
-
-    def forward(self, x, attention_mask=None, head_mask=None):
-        x = self.c_attn(x)
-        query, key, value = x.split(self.split_size, dim=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
-        value = self.split_heads(value)
-
-        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
-        a = attn_outputs[0]
-
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a)
-
-        outputs = [a] + attn_outputs[1:]
-        return outputs  # a, (attentions)
-
-
-class MLP(nn.Module):
-    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super().__init__()
-        nx = config.n_embd
-        self.c_fc = Conv1D(n_state, nx)
-        self.c_proj = Conv1D(nx, n_state)
-        self.act = ACT_FNS[config.afn]
-        self.dropout = nn.Dropout(config.resid_pdrop)
-
-    def forward(self, x):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        return self.dropout(h2)
-
-
-class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False):
-        super().__init__()
-        nx = config.n_embd
-        self.attn = Attention(nx, n_ctx, config, scale)
-        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.mlp = MLP(4 * nx, config)
-        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-
-    def forward(self, x, attention_mask=None, head_mask=None):
-        attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
-        a = attn_outputs[0]
-
-        n = self.ln_1(x + a)
-        m = self.mlp(n)
-        h = self.ln_2(n + m)
-
-        outputs = [h] + attn_outputs[1:]
-        return outputs
-
-
-class OpenAIGPTPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = OpenAIGPTConfig
-    pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_openai_gpt
-    base_model_prefix = "transformer"
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-
-OPENAI_GPT_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-OPENAI_GPT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
-    OPENAI_GPT_START_DOCSTRING,
-)
-class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
-        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.tokens_embed
-
-    def set_input_embeddings(self, new_embeddings):
-        self.tokens_embed = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if position_ids is None:
-            # Code is different from when we had a single embedding matrice from position and token embeddings
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(input_shape[-1], dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # Attention mask.
-        if attention_mask is not None:
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        if inputs_embeds is None:
-            inputs_embeds = self.tokens_embed(input_ids)
-        position_embeds = self.positions_embed(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.tokens_embed(token_type_ids)
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        all_attentions = ()
-        all_hidden_states = ()
-        for i, block in enumerate(self.h):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-
-            outputs = block(hidden_states, attention_mask, head_mask[i])
-            hidden_states = outputs[0]
-            if self.output_attentions:
-                all_attentions = all_attentions + (outputs[1],)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-
-        outputs = (hidden_states.view(*output_shape),)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, (all hidden states), (all attentions)
-
-
-@add_start_docstrings(
-    """OpenAI GPT Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    OPENAI_GPT_START_DOCSTRING,
-)
-class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = OpenAIGPTModel(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
-    """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        hidden_states = transformer_outputs[0]
-        lm_logits = self.lm_head(hidden_states)
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
-
-
-@add_start_docstrings(
-    """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
-    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
-    The language modeling head has its weights tied to the input embeddings,
-    the classification head takes as input the input of a specified classification token index in the input sequence).
-""",
-    OPENAI_GPT_START_DOCSTRING,
-)
-class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        config.num_labels = 1
-        self.transformer = OpenAIGPTModel(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.multiple_choice_head = SequenceSummary(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        mc_token_ids=None,
-        lm_labels=None,
-        mc_labels=None,
-    ):
-        r"""
-        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
-            Language modeling loss.
-        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided):
-            Multiple choice classification loss.
-        lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
-        model.resize_token_embeddings(len(tokenizer))
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
-
-    """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
-
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
-            outputs = (loss,) + outputs
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_roberta.py b/server/transformers/src/transformers/modeling_roberta.py
deleted file mode 100644
index 50de77b85c1428e770798dbc71b944882f4d55bd..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_roberta.py
+++ /dev/null
@@ -1,705 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch RoBERTa model. """
-
-
-import logging
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
-
-
-logger = logging.getLogger(__name__)
-
-ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
-    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
-    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
-    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
-    "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
-    "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
-}
-
-
-class RobertaEmbeddings(BertEmbeddings):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.padding_idx = 1
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
-    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(input_ids).to(input_ids.device)
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-
-        return super().forward(
-            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
-        )
-
-    def create_position_ids_from_input_ids(self, x):
-        """ Replace non-padding symbols with their position numbers. Position numbers begin at
-        padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
-        `utils.make_positions`.
-
-        :param torch.Tensor x:
-        :return torch.Tensor:
-        """
-        mask = x.ne(self.padding_idx).long()
-        incremental_indicies = torch.cumsum(mask, dim=1) * mask
-        return incremental_indicies + self.padding_idx
-
-    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
-        """ We are provided embeddings directly. We cannot infer which are padded so just generate
-        sequential position ids.
-
-        :param torch.Tensor inputs_embeds:
-        :return torch.Tensor:
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-
-
-ROBERTA_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-ROBERTA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaModel(BertModel):
-    """
-    This class overrides :class:`~transformers.BertModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.embeddings = RobertaEmbeddings(config)
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-
-
-@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
-class RobertaForMaskedLM(BertPreTrainedModel):
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.roberta = RobertaModel(config)
-        self.lm_head = RobertaLMHead(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head.decoder
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        masked_lm_labels=None,
-    ):
-        r"""
-        masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMaskedLM
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
-        """
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        sequence_output = outputs[0]
-        prediction_scores = self.lm_head(sequence_output)
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-
-        if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-
-        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
-
-
-class RobertaLMHead(nn.Module):
-    """Roberta Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        self.decoder.bias = self.bias
-
-    def forward(self, features, **kwargs):
-        x = self.dense(features)
-        x = gelu(x)
-        x = self.layer_norm(x)
-
-        # project back to size of vocabulary with bias
-        x = self.decoder(x) + self.bias
-
-        return x
-
-
-@add_start_docstrings(
-    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForSequenceClassification(BertPreTrainedModel):
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config)
-        self.classifier = RobertaClassificationHead(config)
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForSequenceClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
-        """
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        sequence_output = outputs[0]
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Roberta Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForMultipleChoice(BertPreTrainedModel):
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.roberta = RobertaModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, 1)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        token_type_ids=None,
-        attention_mask=None,
-        labels=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
-
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMultipleChoice
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMultipleChoice.from_pretrained('roberta-base')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
-        """
-        num_choices = input_ids.shape[1]
-
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.roberta(
-            flat_input_ids,
-            position_ids=flat_position_ids,
-            token_type_ids=flat_token_type_ids,
-            attention_mask=flat_attention_mask,
-            head_mask=head_mask,
-        )
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        reshaped_logits = logits.view(-1, num_choices)
-
-        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Roberta Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForTokenClassification(BertPreTrainedModel):
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForTokenClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
-        """
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), scores, (hidden_states), (attentions)
-
-
-class RobertaClassificationHead(nn.Module):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
-
-    def forward(self, features, **kwargs):
-        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
-        x = self.dropout(x)
-        x = self.dense(x)
-        x = torch.tanh(x)
-        x = self.dropout(x)
-        x = self.out_proj(x)
-        return x
-
-
-@add_start_docstrings(
-    """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaForQuestionAnswering(BertPreTrainedModel):
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.roberta = RobertaModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        # The checkpoint roberta-large is not fine-tuned for question answering. Please see the
-        # examples/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import RobertaTokenizer, RobertaForQuestionAnswering
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        start_scores, end_scores = model(torch.tensor([input_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        """
-
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_t5.py b/server/transformers/src/transformers/modeling_t5.py
deleted file mode 100644
index 405ebe56674ee80d6414b218d5f9d4e16907ce97..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_t5.py
+++ /dev/null
@@ -1,915 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch T5 model. """
-
-
-import copy
-import itertools
-import logging
-import math
-import os
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torch.nn import CrossEntropyLoss
-
-from .configuration_t5 import T5Config
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings
-from .modeling_utils import PreTrainedModel, prune_linear_layer
-
-
-logger = logging.getLogger(__name__)
-
-####################################################
-# This dict contrains shortcut names and associated url
-# for the pretrained weights provided with the models
-####################################################
-T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
-    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
-    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
-    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
-}
-
-
-####################################################
-# This is a conversion method from TF 1.0 to PyTorch
-# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
-####################################################
-def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
-    try:
-        import re
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    tf_weights = {}
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        tf_weights[name] = array
-
-    for txt_name in names:
-        name = txt_name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            logger.info("Skipping {}".format("/".join(name)))
-            tf_weights.pop(txt_name, None)
-            continue
-        if "_slot_" in name[-1]:
-            logger.info("Skipping {}".format("/".join(name)))
-            tf_weights.pop(txt_name, None)
-            continue
-        pointer = model
-        array = tf_weights[txt_name]
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] in ["kernel", "scale", "embedding"]:
-                pointer = getattr(pointer, "weight")
-            # elif scope_names[0] == 'scale':
-            #     pointer = getattr(pointer, 'weight')
-            # elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
-            #     pointer = getattr(pointer, 'bias')
-            # elif scope_names[0] == 'squad':
-            #     pointer = getattr(pointer, 'classifier')
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if scope_names[0] not in ["kernel", "scale", "embedding"]:
-            pointer = getattr(pointer, "weight")
-        if scope_names[0] != "embedding":
-            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
-            array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array.astype(np.float32))
-        tf_weights.pop(txt_name, None)
-
-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
-    # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
-    return model
-
-
-####################################################
-# PyTorch Models are constructed by sub-classing
-# - torch.nn.Module for the layers and
-# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
-####################################################
-
-
-class T5LayerNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """ Construct a layernorm module in the T5 style
-            No bias and no substraction of mean.
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, x):
-        variance = x.pow(2).mean(-1, keepdim=True)
-        x = x / torch.sqrt(variance + self.variance_epsilon)
-        return self.weight * x
-
-
-class T5DenseReluDense(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
-        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-    def forward(self, hidden_states):
-        h = self.wi(hidden_states)
-        h = F.relu(h)
-        h = self.dropout(h)
-        h = self.wo(h)
-        return h
-
-
-class T5LayerFF(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.DenseReluDense = T5DenseReluDense(config)
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-    def forward(self, hidden_states):
-        norm_x = self.layer_norm(hidden_states)
-        y = self.DenseReluDense(norm_x)
-        layer_output = hidden_states + self.dropout(y)
-        return layer_output
-
-
-class T5Attention(nn.Module):
-    NEW_ID = itertools.count()
-
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.layer_id = next(T5Attention.NEW_ID)
-        self.is_decoder = config.is_decoder
-        self.has_relative_attention_bias = has_relative_attention_bias
-
-        self.output_attentions = config.output_attentions
-        self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.d_model = config.d_model
-        self.d_kv = config.d_kv
-        self.n_heads = config.num_heads
-        self.dropout = config.dropout_rate
-        self.inner_dim = self.n_heads * self.d_kv
-
-        # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
-        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
-        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
-        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
-
-        if self.has_relative_attention_bias:
-            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_heads, self.d_kv)
-        heads = set(heads) - self.pruned_heads
-        for head in heads:
-            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        # Prune linear layers
-        self.q = prune_linear_layer(self.q, index)
-        self.k = prune_linear_layer(self.k, index)
-        self.v = prune_linear_layer(self.v, index)
-        self.o = prune_linear_layer(self.o, index, dim=1)
-        # Update hyper params
-        self.n_heads = self.n_heads - len(heads)
-        self.inner_dim = self.d_kv * self.n_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    @staticmethod
-    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
-        """
-        Adapted from Mesh Tensorflow:
-        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
-
-        Translate relative position to a bucket number for relative attention.
-        The relative position is defined as memory_position - query_position, i.e.
-        the distance in tokens from the attending position to the attended-to
-        position.  If bidirectional=False, then positive relative positions are
-        invalid.
-        We use smaller buckets for small absolute relative_position and larger buckets
-        for larger absolute relative_positions.  All relative positions >=max_distance
-        map to the same bucket.  All relative positions <=-max_distance map to the
-        same bucket.  This should allow for more graceful generalization to longer
-        sequences than the model has been trained on.
-        Args:
-            relative_position: an int32 Tensor
-            bidirectional: a boolean - whether the attention is bidirectional
-            num_buckets: an integer
-            max_distance: an integer
-        Returns:
-            a Tensor with the same shape as relative_position, containing int32
-            values in the range [0, num_buckets)
-        """
-        ret = 0
-        n = -relative_position
-        if bidirectional:
-            num_buckets //= 2
-            ret += (n < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
-            n = torch.abs(n)
-        else:
-            n = torch.max(n, torch.zeros_like(n))
-        # now n is in the range [0, inf)
-
-        # half of the buckets are for exact increments in positions
-        max_exact = num_buckets // 2
-        is_small = n < max_exact
-
-        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
-        val_if_large = max_exact + (
-            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
-        ).to(torch.long)
-        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
-
-        ret += torch.where(is_small, n, val_if_large)
-        return ret
-
-    def compute_bias(self, qlen, klen):
-        """ Compute binned relative position bias """
-        context_position = torch.arange(qlen, dtype=torch.long)[:, None]
-        memory_position = torch.arange(klen, dtype=torch.long)[None, :]
-        relative_position = memory_position - context_position  # shape (qlen, klen)
-        rp_bucket = self._relative_position_bucket(
-            relative_position,  # shape (qlen, klen)
-            bidirectional=not self.is_decoder,
-            num_buckets=self.relative_attention_num_buckets,
-        )
-        rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
-        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
-        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
-        return values
-
-    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
-        """
-        Self-attention (if kv is None) or attention over source sentence (provided by kv).
-        """
-        # Input is (bs, qlen, dim)
-        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
-        bs, qlen, dim = input.size()
-        if kv is None:
-            klen = qlen if cache is None else cache["slen"] + qlen
-        else:
-            klen = kv.size(1)
-
-        def shape(x):
-            """  projection """
-            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)
-
-        def unshape(x):
-            """  compute context """
-            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
-
-        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
-        if kv is None:
-            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
-        elif cache is None or self.layer_id not in cache:
-            k = v = kv
-            k = shape(self.k(k))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(v))  # (bs, n_heads, qlen, dim_per_head)
-
-        if cache is not None:
-            if self.layer_id in cache:
-                if kv is None:
-                    k_, v_ = cache[self.layer_id]
-                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
-                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
-                else:
-                    k, v = cache[self.layer_id]
-            cache[self.layer_id] = (k, v)
-
-        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
-        scores = torch.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)
-
-        if position_bias is None:
-            if not self.has_relative_attention_bias:
-                raise ValueError("No position_bias provided and no weights to compute position_bias")
-            position_bias = self.compute_bias(qlen, klen)
-            if mask is not None:
-                position_bias = position_bias + mask  # (bs, n_heads, qlen, klen)
-
-        scores += position_bias
-        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
-        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)  # (bs, qlen, dim)
-
-        context = self.o(context)
-
-        outputs = (context,)
-        if self.output_attentions:
-            outputs = outputs + (weights,)
-        if self.has_relative_attention_bias:
-            outputs = outputs + (position_bias,)
-        return outputs
-
-
-class T5LayerSelfAttention(nn.Module):
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
-        norm_x = self.layer_norm(hidden_states)
-        attention_output = self.SelfAttention(
-            norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask
-        )
-        y = attention_output[0]
-        layer_output = hidden_states + self.dropout(y)
-        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class T5LayerCrossAttention(nn.Module):
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
-        norm_x = self.layer_norm(hidden_states)
-        attention_output = self.EncDecAttention(
-            norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask
-        )
-        y = attention_output[0]
-        layer_output = hidden_states + self.dropout(y)
-        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class T5Block(nn.Module):
-    def __init__(self, config, has_relative_attention_bias=False):
-        super().__init__()
-        self.is_decoder = config.is_decoder
-        self.layer = nn.ModuleList()
-        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
-        if self.is_decoder:
-            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
-            self.layer.append(T5LayerFF(config))
-        else:
-            self.layer.append(T5LayerFF(config))
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        position_bias=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        encoder_decoder_position_bias=None,
-        head_mask=None,
-    ):
-        self_attention_outputs = self.layer[0](
-            hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask
-        )
-        hidden_states = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
-
-        if not self.is_decoder:
-            hidden_states = self.layer[1](hidden_states)
-        else:
-            cross_attention_outputs = self.layer[1](
-                hidden_states,
-                kv=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask,
-            )
-            hidden_states = cross_attention_outputs[0]
-            outputs = (
-                outputs + cross_attention_outputs[1:]
-            )  # Keep cross-attention outputs and relative position weights
-            hidden_states = self.layer[2](hidden_states)
-
-        outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-
-
-class T5PreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = T5Config
-    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_t5
-    base_model_prefix = "transformer"
-
-    @property
-    def dummy_inputs(self):
-        input_ids = torch.tensor(DUMMY_INPUTS)
-        input_mask = torch.tensor(DUMMY_MASK)
-        dummy_inputs = {
-            "decoder_input_ids": input_ids,
-            "encoder_input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-        return dummy_inputs
-
-    def _init_weights(self, module):
-        """ Initialize the weights """
-        factor = self.config.initializer_factor  # Used for testing weights initialization
-        if isinstance(module, T5LayerNorm):
-            module.weight.data.fill_(factor * 1.0)
-        elif isinstance(module, (T5Model, T5WithLMHeadModel)):
-            # Mesh TensorFlow embeddings initialization
-            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
-            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
-        elif isinstance(module, T5DenseReluDense):
-            # Mesh TensorFlow FF initialization
-            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
-            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
-            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
-            if hasattr(module.wi, "bias") and module.wi.bias is not None:
-                module.wi.bias.data.zero_()
-            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
-            if hasattr(module.wo, "bias") and module.wo.bias is not None:
-                module.wo.bias.data.zero_()
-        elif isinstance(module, T5Attention):
-            # Mesh TensorFlow attention initialization to avoid scaling before softmax
-            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
-            d_model = self.config.d_model
-            d_kv = self.config.d_kv
-            n_heads = self.config.num_heads
-            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
-            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
-            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
-            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
-            if module.has_relative_attention_bias:
-                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
-
-
-class T5Stack(T5PreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.is_decoder = config.is_decoder
-
-        self.block = nn.ModuleList(
-            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
-        )
-        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-        self.init_weights()
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        head_mask=None,
-    ):
-
-        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
-        if attention_mask is None:
-            attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
-        if self.is_decoder and encoder_attention_mask is None:
-            encoder_seq_length = encoder_hidden_states.shape[1]
-            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        if attention_mask.dim() == 3:
-            extended_attention_mask = attention_mask[:, None, :, :]
-        elif attention_mask.dim() == 2:
-            # Provided a padding mask of dimensions [batch_size, seq_length]
-            # - if the model is a decoder, apply a causal mask in addition to the padding mask
-            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-            if self.config.is_decoder:
-                seq_ids = torch.arange(seq_length, device=hidden_states.device)
-                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
-                causal_mask = causal_mask.to(attention_mask)
-                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
-            else:
-                extended_attention_mask = attention_mask[:, None, None, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -1e9 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-
-        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
-        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
-        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
-
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
-
-        if self.is_decoder:
-            # If a 2D ou 3D attention mask is provided for the cross-attention
-            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-            if encoder_attention_mask.dim() == 3:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-            if encoder_attention_mask.dim() == 2:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
-
-            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
-            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
-            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
-
-            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # fp16 compatibility
-            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
-        else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_layers
-
-        all_hidden_states = ()
-        all_attentions = ()
-        position_bias = None
-        encoder_decoder_position_bias = None
-
-        hidden_states = self.dropout(hidden_states)
-        for i, layer_module in enumerate(self.block):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(
-                hidden_states,
-                attention_mask=extended_attention_mask,
-                position_bias=position_bias,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_extended_attention_mask,
-                encoder_decoder_position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask[i],
-            )
-            # layer_outputs is a tuple with:
-            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-            hidden_states = layer_outputs[0]
-            if i == 0:
-                # We share the position biases between the layers - the first layer store them
-                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-                position_bias = layer_outputs[2 if self.output_attentions else 1]
-                if self.is_decoder:
-                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-T5_START_DOCSTRING = r"""    The T5 model was proposed in
-    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
-    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
-    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
-
-    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
-    refer to the PyTorch documentation for all matter related to general usage and behavior.
-
-    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
-        https://arxiv.org/abs/1910.10683
-
-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
-
-    Parameters:
-        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-T5_INPUTS_DOCSTRING = r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
-            (a) For sequence pairs:
-
-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
-            (b) For single sequences:
-
-                ``tokens:         [CLS] the dog is hairy . [SEP]``
-
-            T5 is a model with relative position embeddings so you should be able to pad the inputs on
-            the right or the left.
-
-            Indices can be obtained using :class:`transformers.T5Tokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-"""
-
-
-@add_start_docstrings(
-    "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
-    T5_START_DOCSTRING,
-    T5_INPUTS_DOCSTRING,
-)
-class T5Model(T5PreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5Model.from_pretrained('t5-small')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids=input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = T5Stack(encoder_config)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = T5Stack(decoder_config)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def set_input_embeddings(self, new_embeddings):
-        self.shared = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-
-    def forward(self, **kwargs):
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_common = dict(
-            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
-        )
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
-
-        # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
-        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
-        if encoder_hidden_states is None:
-            # Convert encoder inputs in embeddings if needed
-            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
-            if hidden_states is None:
-                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
-
-            if encoder_attention_mask is not None:
-                # Apply masking
-                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
-                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)
-
-            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
-        else:
-            encoder_outputs = ()
-
-        # Decode
-        # Convert decoder inputs in embeddings if needed
-        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
-        if hidden_states is None:
-            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-            hidden_states = self.shared(decoder_inputs_ids)
-
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
-        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
-
-        return decoder_outputs + encoder_outputs
-
-
-@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
-class T5WithLMHeadModel(T5PreTrainedModel):
-    r"""
-        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``.
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5WithLMHeadModel.from_pretrained('t5-small')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids=input_ids, lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.model_dim = config.d_model
-
-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = T5Stack(encoder_config)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = T5Stack(decoder_config)
-
-        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def set_input_embeddings(self, new_embeddings):
-        self.shared = new_embeddings
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def forward(self, **kwargs):
-        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
-        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
-        # that apply to the model as whole.
-        # We let the specific kwargs override the common ones in case of conflict.
-
-        lm_labels = kwargs.pop("decoder_lm_labels", None)
-
-        kwargs_common = dict(
-            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
-        )
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
-
-        # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
-        if encoder_hidden_states is None:
-            # Convert encoder inputs in embeddings if needed
-            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
-            if hidden_states is None:
-                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
-
-            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
-        else:
-            encoder_outputs = ()
-
-        # Decode
-        # Convert decoder inputs in embeddings if needed
-        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
-        if hidden_states is None:
-            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-            hidden_states = self.shared(decoder_inputs_ids)
-
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
-        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
-
-        sequence_output = decoder_outputs[0]
-        # Rescale output before projecting on vocab
-        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
-        sequence_output = sequence_output * (self.model_dim ** -0.5)
-        lm_logits = self.lm_head(sequence_output)
-
-        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-            decoder_outputs = (
-                loss,
-            ) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
-
-        return decoder_outputs + encoder_outputs
diff --git a/server/transformers/src/transformers/modeling_tf_albert.py b/server/transformers/src/transformers/modeling_tf_albert.py
deleted file mode 100644
index 2a1d3f1c4d8ae1844c8de05dbac242ba6d85f042..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_albert.py
+++ /dev/null
@@ -1,814 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 ALBERT model. """
-
-
-import logging
-
-import tensorflow as tf
-
-from .configuration_albert import AlbertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
-    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
-    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
-    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
-    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
-    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
-    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
-    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
-}
-
-
-class TFAlbertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.config = config
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            config.embedding_size,
-            embeddings_initializer=get_initializer(self.config.initializer_range),
-            name="position_embeddings",
-        )
-        self.token_type_embeddings = tf.keras.layers.Embedding(
-            config.type_vocab_size,
-            config.embedding_size,
-            embeddings_initializer=get_initializer(self.config.initializer_range),
-            name="token_type_embeddings",
-        )
-
-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight",
-                shape=[self.config.vocab_size, self.config.embedding_size],
-                initializer=get_initializer(self.config.initializer_range),
-            )
-        super().build(input_shape)
-
-    def call(self, inputs, mode="embedding", training=False):
-        """Get token embeddings of inputs.
-        Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
-            mode: string, a valid value is one of "embedding" and "linear".
-        Returns:
-            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
-                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
-                linear tensor, float32 with shape [batch_size, length, vocab_size].
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(inputs, training=training)
-        elif mode == "linear":
-            return self._linear(inputs)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, inputs, training=False):
-        """Applies embedding based on inputs tensor."""
-        input_ids, position_ids, token_type_ids, inputs_embeds = inputs
-
-        if input_ids is not None:
-            input_shape = shape_list(input_ids)
-        else:
-            input_shape = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shape[1]
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
-        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings, training=training)
-        return embeddings
-
-    def _linear(self, inputs):
-        """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, embedding_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-        x = tf.reshape(inputs, [-1, self.config.embedding_size])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-        return tf.reshape(logits, [batch_size, length, self.config.vocab_size])
-
-
-class TFAlbertSelfAttention(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        if config.hidden_size % config.num_attention_heads != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
-            )
-        self.output_attentions = config.output_attentions
-
-        self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
-        )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
-        )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
-        )
-
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
-
-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        # (batch size, num_heads, seq_len_q, seq_len_k)
-        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
-        # scale attention_scores
-        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)
-        attention_scores = attention_scores / tf.math.sqrt(dk)
-
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = tf.matmul(attention_probs, value_layer)
-
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-
-        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
-        return outputs
-
-
-class TFAlbertSelfOutput(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, inputs, training=False):
-        hidden_states, input_tensor = inputs
-
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
-
-
-class TFAlbertAttention(TFBertSelfAttention):
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
-
-        self.hidden_size = config.hidden_size
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    def call(self, inputs, training=False):
-        input_tensor, attention_mask, head_mask = inputs
-
-        batch_size = shape_list(input_tensor)[0]
-        mixed_query_layer = self.query(input_tensor)
-        mixed_key_layer = self.key(input_tensor)
-        mixed_value_layer = self.value(input_tensor)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        # (batch size, num_heads, seq_len_q, seq_len_k)
-        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
-        # scale attention_scores
-        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)
-        attention_scores = attention_scores / tf.math.sqrt(dk)
-
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = tf.matmul(attention_probs, value_layer)
-
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-
-        self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
-
-        hidden_states = self_outputs[0]
-
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        attention_output = self.LayerNorm(hidden_states + input_tensor)
-
-        # add attentions if we output them
-        outputs = (attention_output,) + self_outputs[1:]
-        return outputs
-
-
-class TFAlbertLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.attention = TFAlbertAttention(config, name="attention")
-
-        self.ffn = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
-        )
-
-        if isinstance(config.hidden_act, str):
-            self.activation = ACT2FN[config.hidden_act]
-        else:
-            self.activation = config.hidden_act
-
-        self.ffn_output = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
-        )
-        self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
-            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
-        )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
-        ffn_output = self.ffn(attention_outputs[0])
-        ffn_output = self.activation(ffn_output)
-        ffn_output = self.ffn_output(ffn_output)
-
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0])
-
-        # add attentions if we output them
-        outputs = (hidden_states,) + attention_outputs[1:]
-        return outputs
-
-
-class TFAlbertLayerGroup(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.albert_layers = [
-            TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
-        ]
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        layer_hidden_states = ()
-        layer_attentions = ()
-
-        for layer_index, albert_layer in enumerate(self.albert_layers):
-            layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training)
-            hidden_states = layer_output[0]
-
-            if self.output_attentions:
-                layer_attentions = layer_attentions + (layer_output[1],)
-
-            if self.output_hidden_states:
-                layer_hidden_states = layer_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (layer_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (layer_attentions,)
-        # last-layer hidden state, (layer hidden states), (layer attentions)
-        return outputs
-
-
-class TFAlbertTransformer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.config = config
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
-            config.hidden_size,
-            kernel_initializer=get_initializer(config.initializer_range),
-            name="embedding_hidden_mapping_in",
-        )
-        self.albert_layer_groups = [
-            TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i))
-            for i in range(config.num_hidden_groups)
-        ]
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
-        all_attentions = ()
-
-        if self.output_hidden_states:
-            all_hidden_states = (hidden_states,)
-
-        for i in range(self.config.num_hidden_layers):
-            # Number of layers in a hidden group
-            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
-
-            # Index of the hidden group
-            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
-
-            layer_group_output = self.albert_layer_groups[group_idx](
-                [
-                    hidden_states,
-                    attention_mask,
-                    head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
-                ],
-                training=training,
-            )
-            hidden_states = layer_group_output[0]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + layer_group_output[-1]
-
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-
-        # last-layer hidden state, (all hidden states), (all attentions)
-        return outputs
-
-
-class TFAlbertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = AlbertConfig
-    pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "albert"
-
-
-class TFAlbertMLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-
-        self.dense = tf.keras.layers.Dense(
-            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        if isinstance(config.hidden_act, str):
-            self.activation = ACT2FN[config.hidden_act]
-        else:
-            self.activation = config.hidden_act
-
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.decoder = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        self.decoder_bias = self.add_weight(
-            shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
-        )
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.activation(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-ALBERT_START_DOCSTRING = r"""
-    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
-    Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
-        https://arxiv.org/abs/1909.11942
-
-    .. _`tf.keras.Model`:
-        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Args:
-        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-ALBERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
-    ALBERT_START_DOCSTRING,
-)
-class TFAlbertModel(TFAlbertPreTrainedModel):
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
-        self.num_hidden_layers = config.num_hidden_layers
-
-        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
-        self.encoder = TFAlbertTransformer(config, name="encoder")
-        self.pooler = tf.keras.layers.Dense(
-            config.hidden_size,
-            kernel_initializer=get_initializer(config.initializer_range),
-            activation="tanh",
-            name="pooler",
-        )
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        raise NotImplementedError
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Albert pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertModel
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertModel.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
-            assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 6, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if attention_mask is None:
-            attention_mask = tf.fill(input_shape, 1)
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-
-        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
-        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output[:, 0])
-
-        # add hidden_states and attentions if they are here
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
-        # sequence_output, pooled_output, (hidden_states), (attentions)
-        return outputs
-
-
-@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
-class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
-
-        self.albert = TFAlbertModel(config, name="albert")
-        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
-
-    def get_output_embeddings(self):
-        return self.albert.embeddings
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForMaskedLM
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
-        """
-        outputs = self.albert(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-        prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
-
-        # Add hidden states and attention if they are here
-        outputs = (prediction_scores,) + outputs[2:]
-
-        return outputs  # prediction_scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    ALBERT_START_DOCSTRING,
-)
-class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.albert = TFAlbertModel(config, name="albert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`)
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        outputs = self.albert(inputs, **kwargs)
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_auto.py b/server/transformers/src/transformers/modeling_tf_auto.py
deleted file mode 100644
index dd661006d09b1638488657aa0fd9d5cc801dad07..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_auto.py
+++ /dev/null
@@ -1,1092 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Auto Model class. """
-
-
-import logging
-from collections import OrderedDict
-
-from .configuration_auto import (
-    AlbertConfig,
-    AutoConfig,
-    BertConfig,
-    CTRLConfig,
-    DistilBertConfig,
-    GPT2Config,
-    OpenAIGPTConfig,
-    RobertaConfig,
-    T5Config,
-    TransfoXLConfig,
-    XLMConfig,
-    XLNetConfig,
-)
-from .configuration_utils import PretrainedConfig
-from .modeling_tf_albert import (
-    TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFAlbertForMaskedLM,
-    TFAlbertForSequenceClassification,
-    TFAlbertModel,
-)
-from .modeling_tf_bert import (
-    TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFBertForMaskedLM,
-    TFBertForPreTraining,
-    TFBertForQuestionAnswering,
-    TFBertForSequenceClassification,
-    TFBertForTokenClassification,
-    TFBertModel,
-)
-from .modeling_tf_ctrl import TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, TFCTRLLMHeadModel, TFCTRLModel
-from .modeling_tf_distilbert import (
-    TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFDistilBertForMaskedLM,
-    TFDistilBertForQuestionAnswering,
-    TFDistilBertForSequenceClassification,
-    TFDistilBertForTokenClassification,
-    TFDistilBertModel,
-)
-from .modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, TFGPT2LMHeadModel, TFGPT2Model
-from .modeling_tf_openai import TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
-from .modeling_tf_roberta import (
-    TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFRobertaForMaskedLM,
-    TFRobertaForSequenceClassification,
-    TFRobertaForTokenClassification,
-    TFRobertaModel,
-)
-from .modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, TFT5Model, TFT5WithLMHeadModel
-from .modeling_tf_transfo_xl import (
-    TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFTransfoXLLMHeadModel,
-    TFTransfoXLModel,
-)
-from .modeling_tf_xlm import (
-    TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFXLMForQuestionAnsweringSimple,
-    TFXLMForSequenceClassification,
-    TFXLMModel,
-    TFXLMWithLMHeadModel,
-)
-from .modeling_tf_xlnet import (
-    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    TFXLNetForQuestionAnsweringSimple,
-    TFXLNetForSequenceClassification,
-    TFXLNetForTokenClassification,
-    TFXLNetLMHeadModel,
-    TFXLNetModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
-    (key, value)
-    for pretrained_map in [
-        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
-    ]
-    for key, value, in pretrained_map.items()
-)
-
-TF_MODEL_MAPPING = OrderedDict(
-    [
-        (T5Config, TFT5Model),
-        (DistilBertConfig, TFDistilBertModel),
-        (AlbertConfig, TFAlbertModel),
-        (RobertaConfig, TFRobertaModel),
-        (BertConfig, TFBertModel),
-        (OpenAIGPTConfig, TFOpenAIGPTModel),
-        (GPT2Config, TFGPT2Model),
-        (TransfoXLConfig, TFTransfoXLModel),
-        (XLNetConfig, TFXLNetModel),
-        (XLMConfig, TFXLMModel),
-        (CTRLConfig, TFCTRLModel),
-    ]
-)
-
-TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
-    [
-        (T5Config, TFT5WithLMHeadModel),
-        (DistilBertConfig, TFDistilBertForMaskedLM),
-        (AlbertConfig, TFAlbertForMaskedLM),
-        (RobertaConfig, TFRobertaForMaskedLM),
-        (BertConfig, TFBertForPreTraining),
-        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
-        (GPT2Config, TFGPT2LMHeadModel),
-        (TransfoXLConfig, TFTransfoXLLMHeadModel),
-        (XLNetConfig, TFXLNetLMHeadModel),
-        (XLMConfig, TFXLMWithLMHeadModel),
-        (CTRLConfig, TFCTRLLMHeadModel),
-    ]
-)
-
-TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
-    [
-        (T5Config, TFT5WithLMHeadModel),
-        (DistilBertConfig, TFDistilBertForMaskedLM),
-        (AlbertConfig, TFAlbertForMaskedLM),
-        (RobertaConfig, TFRobertaForMaskedLM),
-        (BertConfig, TFBertForMaskedLM),
-        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
-        (GPT2Config, TFGPT2LMHeadModel),
-        (TransfoXLConfig, TFTransfoXLLMHeadModel),
-        (XLNetConfig, TFXLNetLMHeadModel),
-        (XLMConfig, TFXLMWithLMHeadModel),
-        (CTRLConfig, TFCTRLLMHeadModel),
-    ]
-)
-
-TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, TFDistilBertForSequenceClassification),
-        (AlbertConfig, TFAlbertForSequenceClassification),
-        (RobertaConfig, TFRobertaForSequenceClassification),
-        (BertConfig, TFBertForSequenceClassification),
-        (XLNetConfig, TFXLNetForSequenceClassification),
-        (XLMConfig, TFXLMForSequenceClassification),
-    ]
-)
-
-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, TFDistilBertForQuestionAnswering),
-        (BertConfig, TFBertForQuestionAnswering),
-        (XLNetConfig, TFXLNetForQuestionAnsweringSimple),
-        (XLMConfig, TFXLMForQuestionAnsweringSimple),
-    ]
-)
-
-TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
-    [
-        (DistilBertConfig, TFDistilBertForTokenClassification),
-        (RobertaConfig, TFRobertaForTokenClassification),
-        (BertConfig, TFBertForTokenClassification),
-        (XLNetConfig, TFXLNetForTokenClassification),
-    ]
-)
-
-
-class TFAutoModel(object):
-    r"""
-        :class:`~transformers.TFAutoModel` is a generic model class
-        that will be instantiated as one of the base model classes of the library
-        when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: TFT5Model (T5 model)
-            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
-            - contains `roberta`: TFRobertaModel (RoBERTa model)
-            - contains `bert`: TFBertModel (Bert model)
-            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: TFXLNetModel (XLNet model)
-            - contains `xlm`: TFXLMModel (XLM model)
-            - contains `ctrl`: TFCTRLModel (CTRL model)
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModel is designed to be instantiated "
-            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModel.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model)
-                    - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model)
-                    - isInstance of `bert` configuration class: TFBertModel (Bert model)
-                    - isInstance of `openai-gpt` configuration class: TFOpenAIGPTModel (OpenAI GPT model)
-                    - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model)
-                    - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL  model)
-                    - isInstance of `transfo-xl` configuration class: TFTransfoXLModel (Transformer-XL model)
-                    - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model)
-                    - isInstance of `xlm` configuration class: TFXLMModel (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = TFAutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the base model classes of the library
-        from a pre-trained model configuration.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: TFT5Model (T5 model)
-            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
-            - contains `roberta`: TFRobertaModel (RoBERTa model)
-            - contains `bert`: TFTFBertModel (Bert model)
-            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: TFXLNetModel (XLNet model)
-            - contains `ctrl`: TFCTRLModel (CTRL model)
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
-
-            from_pt: (`Optional`) Boolean
-                Set to True if the Checkpoint is a PyTorch checkpoint.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_MAPPING.keys())
-            )
-        )
-
-
-class TFAutoModelForPreTraining(object):
-    r"""
-        :class:`~transformers.TFAutoModelForPreTraining` is a generic model class
-        that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModelForPreTraining is designed to be instantiated "
-            "using the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelForPreTraining.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-        Args:
-            config (:class:`~transformers.PretrainedConfig`):
-                The model class to instantiate is selected based on the configuration class:
-
-                - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertModelForMaskedLM` (DistilBERT model)
-                - isInstance of `roberta` configuration class: :class:`~transformers.TFRobertaModelForMaskedLM` (RoBERTa model)
-                - isInstance of `bert` configuration class: :class:`~transformers.TFBertForPreTraining` (Bert model)
-                - isInstance of `openai-gpt` configuration class: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
-                - isInstance of `gpt2` configuration class: :class:`~transformers.TFGPT2ModelLMHeadModel` (OpenAI GPT-2 model)
-                - isInstance of `ctrl` configuration class: :class:`~transformers.TFCTRLModelLMHeadModel` (Salesforce CTRL  model)
-                - isInstance of `transfo-xl` configuration class: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model)
-                - isInstance of `xlnet` configuration class: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model)
-                - isInstance of `xlm` configuration class: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = TFAutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: :class:`~transformers.TFT5ModelWithLMHead` (T5 model)
-            - contains `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model)
-            - contains `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model)
-            - contains `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model)
-            - contains `bert`: :class:`~transformers.TFBertForPreTraining` (Bert model)
-            - contains `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
-            - contains `gpt2`: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model)
-            - contains `transfo-xl`: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model)
-            - contains `xlnet`: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model)
-            - contains `xlm`: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model)
-            - contains `ctrl`: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Args:
-            pretrained_model_name_or_path:
-                Either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model.
-                (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or
-                automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                  underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
-                  already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                  initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                  ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                  with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                  attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModelForPreTraining.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModelForPreTraining.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys())
-            )
-        )
-
-
-class TFAutoModelWithLMHead(object):
-    r"""
-        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
-        that will be instantiated as one of the language modeling model classes of the library
-        when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: TFT5WithLMHeadModel (T5 model)
-            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
-            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
-            - contains `bert`: TFBertForMaskedLM (Bert model)
-            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
-            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
-            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
-            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
-            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModelWithLMHead is designed to be instantiated "
-            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelWithLMHead.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
-                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
-                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
-                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
-                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
-                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
-                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
-                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = TFAutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys())
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the language modeling model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: TFT5WithLMHeadModel (T5 model)
-            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
-            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
-            - contains `bert`: TFBertForMaskedLM (Bert model)
-            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
-            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
-            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
-            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
-            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
-
-            from_pt: (`Optional`) Boolean
-                Set to True if the Checkpoint is a PyTorch checkpoint.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_WITH_LM_HEAD_MAPPING.keys())
-            )
-        )
-
-
-class TFAutoModelForSequenceClassification(object):
-    r"""
-        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
-        that will be instantiated as one of the sequence classification model classes of the library
-        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
-            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
-            - contains `bert`: TFBertForSequenceClassification (Bert model)
-            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
-            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModelForSequenceClassification is designed to be instantiated "
-            "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelForSequenceClassification.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
-                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
-                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
-                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the sequence classification model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
-            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
-            - contains `bert`: TFBertForSequenceClassification (Bert model)
-            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
-            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
-
-            from_pt: (`Optional`) Boolean
-                Set to True if the Checkpoint is a PyTorch checkpoint.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-
-class TFAutoModelForQuestionAnswering(object):
-    r"""
-        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
-        that will be instantiated as one of the question answering model classes of the library
-        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
-            - contains `bert`: TFBertForQuestionAnswering (Bert model)
-            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
-            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
-
-        This class cannot be instantiated using `__init__()` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModelForQuestionAnswering is designed to be instantiated "
-            "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
-            "`TFAutoModelForQuestionAnswering.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
-                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
-                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
-            - contains `bert`: TFBertForQuestionAnswering (Bert model)
-            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
-            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
-
-            from_pt: (`Optional`) Boolean
-                Set to True if the Checkpoint is a PyTorch checkpoint.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()),
-            )
-        )
-
-
-class TFAutoModelForTokenClassification:
-    def __init__(self):
-        raise EnvironmentError(
-            "TFAutoModelForTokenClassification is designed to be instantiated "
-            "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
-            "`AutoModelForTokenClassification.from_config(config)` methods."
-        )
-
-    @classmethod
-    def from_config(cls, config):
-        r""" Instantiates one of the base model classes of the library
-        from a configuration.
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
-                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
-                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model)
-                    - isInstance of `roberta` configuration class: RobteraModel (Roberta model)
-
-        Examples::
-
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = TFAutoModelForTokenClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        """
-        for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class(config)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the question answering model classes of the library
-        from a pre-trained model configuration.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertForTokenClassification (Bert model)
-            - contains `xlnet`: XLNetForTokenClassification (XLNet model)
-            - contains `distilbert`: DistilBertForTokenClassification (DistilBert model)
-            - contains `roberta`: RobertaForTokenClassification (Roberta model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = TFAutoModelForTokenClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        for config_class, model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.items():
-            if isinstance(config, config_class):
-                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
-        raise ValueError(
-            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(c.__name__ for c in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
-            )
-        )
diff --git a/server/transformers/src/transformers/modeling_tf_bert.py b/server/transformers/src/transformers/modeling_tf_bert.py
deleted file mode 100644
index 01bc1c2be73afbf5cf44cd94c75db33c957d69f5..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_bert.py
+++ /dev/null
@@ -1,1163 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 BERT model. """
-
-
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_bert import BertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-
-TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
-    "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
-    "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
-    "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
-    "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
-    "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
-    "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
-    "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
-    "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
-    "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
-    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
-    "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
-    "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
-    "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
-    "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
-    "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
-    "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
-    "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
-    "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
-    "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/tf_model.h5",
-}
-
-
-def gelu(x):
-    """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
-    return x * cdf
-
-
-def gelu_new(x):
-    """Gaussian Error Linear Unit.
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-    Args:
-        x: float Tensor to perform activation.
-    Returns:
-        `x` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
-
-
-def swish(x):
-    return x * tf.sigmoid(x)
-
-
-ACT2FN = {
-    "gelu": tf.keras.layers.Activation(gelu),
-    "relu": tf.keras.activations.relu,
-    "swish": tf.keras.layers.Activation(swish),
-    "gelu_new": tf.keras.layers.Activation(gelu_new),
-}
-
-
-class TFBertEmbeddings(tf.keras.layers.Layer):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        self.hidden_size = config.hidden_size
-        self.initializer_range = config.initializer_range
-
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            config.hidden_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="position_embeddings",
-        )
-        self.token_type_embeddings = tf.keras.layers.Embedding(
-            config.type_vocab_size,
-            config.hidden_size,
-            embeddings_initializer=get_initializer(self.initializer_range),
-            name="token_type_embeddings",
-        )
-
-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight",
-                shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(self.initializer_range),
-            )
-        super().build(input_shape)
-
-    def call(self, inputs, mode="embedding", training=False):
-        """Get token embeddings of inputs.
-        Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
-            mode: string, a valid value is one of "embedding" and "linear".
-        Returns:
-            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
-                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
-                linear tensor, float32 with shape [batch_size, length, vocab_size].
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(inputs, training=training)
-        elif mode == "linear":
-            return self._linear(inputs)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, inputs, training=False):
-        """Applies embedding based on inputs tensor."""
-        input_ids, position_ids, token_type_ids, inputs_embeds = inputs
-
-        if input_ids is not None:
-            input_shape = shape_list(input_ids)
-        else:
-            input_shape = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shape[1]
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
-        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings, training=training)
-        return embeddings
-
-    def _linear(self, inputs):
-        """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-
-        x = tf.reshape(inputs, [-1, self.hidden_size])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-
-        return tf.reshape(logits, [batch_size, length, self.vocab_size])
-
-
-class TFBertSelfAttention(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        if config.hidden_size % config.num_attention_heads != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
-            )
-        self.output_attentions = config.output_attentions
-
-        self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
-        )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
-        )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
-        )
-
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
-
-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(
-            query_layer, key_layer, transpose_b=True
-        )  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], tf.float32)  # scale attention_scores
-        attention_scores = attention_scores / tf.math.sqrt(dk)
-
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = tf.matmul(attention_probs, value_layer)
-
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-
-        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
-        return outputs
-
-
-class TFBertSelfOutput(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, inputs, training=False):
-        hidden_states, input_tensor = inputs
-
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
-
-
-class TFBertAttention(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.self_attention = TFBertSelfAttention(config, name="self")
-        self.dense_output = TFBertSelfOutput(config, name="output")
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    def call(self, inputs, training=False):
-        input_tensor, attention_mask, head_mask = inputs
-
-        self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training)
-        attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
-        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
-class TFBertIntermediate(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.intermediate_act_fn = config.hidden_act
-
-    def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
-        return hidden_states
-
-
-class TFBertOutput(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-
-    def call(self, inputs, training=False):
-        hidden_states, input_tensor = inputs
-
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
-
-
-class TFBertLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.attention = TFBertAttention(config, name="attention")
-        self.intermediate = TFBertIntermediate(config, name="intermediate")
-        self.bert_output = TFBertOutput(config, name="output")
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.bert_output([intermediate_output, attention_output], training=training)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
-class TFBertEncoder(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        all_hidden_states = ()
-        all_attentions = ()
-        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training)
-            hidden_states = layer_outputs[0]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # outputs, (hidden states), (attentions)
-
-
-class TFBertPooler(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size,
-            kernel_initializer=get_initializer(config.initializer_range),
-            activation="tanh",
-            name="dense",
-        )
-
-    def call(self, hidden_states):
-        # We "pool" the model by simply taking the hidden state corresponding
-        # to the first token.
-        first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense(first_token_tensor)
-        return pooled_output
-
-
-class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        if isinstance(config.hidden_act, str):
-            self.transform_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.transform_act_fn = config.hidden_act
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-
-    def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.transform_act_fn(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        return hidden_states
-
-
-class TFBertLMPredictionHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        self.transform = TFBertPredictionHeadTransform(config, name="transform")
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.input_embeddings = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.transform(hidden_states)
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-class TFBertMLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
-
-    def call(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
-
-
-class TFBertNSPHead(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.seq_relationship = tf.keras.layers.Dense(
-            2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
-        )
-
-    def call(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class TFBertMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.num_hidden_layers = config.num_hidden_layers
-
-        self.embeddings = TFBertEmbeddings(config, name="embeddings")
-        self.encoder = TFBertEncoder(config, name="encoder")
-        self.pooler = TFBertPooler(config, name="pooler")
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        raise NotImplementedError
-
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
-            assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 6, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if attention_mask is None:
-            attention_mask = tf.fill(input_shape, 1)
-        if token_type_ids is None:
-            token_type_ids = tf.fill(input_shape, 0)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-
-        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
-        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
-
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
-
-
-class TFBertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = BertConfig
-    pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "bert"
-
-
-BERT_START_DOCSTRING = r"""
-    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
-    Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-BERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`__
-        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`__
-        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
-    BERT_START_DOCSTRING,
-)
-class TFBertModel(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.bert = TFBertMainLayer(config, name="bert")
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-        Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-                Sequence of hidden-states at the output of the last layer of the model.
-            pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
-                Last layer hidden-state of the first token of the sequence (classification token)
-                further processed by a Linear layer and a Tanh activation function. The Linear
-                layer weights are trained from the next sentence prediction (classification)
-                objective during Bert pretraining. This output is usually *not* a good summary
-                of the semantic content of the input, you're often better with averaging or pooling
-                the sequence of hidden-states for the whole input sequence.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-        Examples::
-
-            import tensorflow as tf
-            from transformers import BertTokenizer, TFBertModel
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = TFBertModel.from_pretrained('bert-base-uncased')
-            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-            outputs = model(input_ids)
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-        """
-        outputs = self.bert(inputs, **kwargs)
-        return outputs
-
-
-@add_start_docstrings(
-    """Bert Model with two heads on top as done during the pre-training:
-    a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
-    BERT_START_DOCSTRING,
-)
-class TFBertForPreTraining(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.nsp = TFBertNSPHead(config, name="nsp___cls")
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
-
-    def get_output_embeddings(self):
-        return self.bert.embeddings
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForPreTraining
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        sequence_output, pooled_output = outputs[:2]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
-        seq_relationship_score = self.nsp(pooled_output)
-
-        outputs = (prediction_scores, seq_relationship_score,) + outputs[
-            2:
-        ]  # add hidden states and attention if they are here
-
-        return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-
-
-@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
-class TFBertForMaskedLM(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
-
-    def get_output_embeddings(self):
-        return self.bert.embeddings
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMaskedLM
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-
-        return outputs  # prediction_scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
-)
-class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.nsp = TFBertNSPHead(config, name="nsp___cls")
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForNextSentencePrediction
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        seq_relationship_scores = outputs[0]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        pooled_output = outputs[1]
-        seq_relationship_score = self.nsp(pooled_output)
-
-        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # seq_relationship_score, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    BERT_START_DOCSTRING,
-)
-class TFBertForSequenceClassification(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForSequenceClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING,
-)
-class TFBertForMultipleChoice(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @property
-    def dummy_inputs(self):
-        """ Dummy inputs to build the network.
-
-        Returns:
-            tf.Tensor with dummy inputs
-        """
-        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
-            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
-
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMultipleChoice
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
-        outputs = model(input_ids)
-        classification_scores = outputs[0]
-
-        """
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
-            assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 6, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None:
-            num_choices = shape_list(input_ids)[1]
-            seq_length = shape_list(input_ids)[2]
-        else:
-            num_choices = shape_list(inputs_embeds)[1]
-            seq_length = shape_list(inputs_embeds)[2]
-
-        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
-        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
-        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
-        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
-        flat_inputs = [
-            flat_input_ids,
-            flat_attention_mask,
-            flat_token_type_ids,
-            flat_position_ids,
-            head_mask,
-            inputs_embeds,
-        ]
-
-        outputs = self.bert(flat_inputs, training=training)
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output, training=training)
-        logits = self.classifier(pooled_output)
-        reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
-        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # reshaped_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    BERT_START_DOCSTRING,
-)
-class TFBertForTokenClassification(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForTokenClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    BERT_START_DOCSTRING,
-)
-class TFBertForQuestionAnswering(TFBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.bert = TFBertMainLayer(config, name="bert")
-        self.qa_outputs = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
-        )
-
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForQuestionAnswering
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-        """
-        outputs = self.bert(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-
-        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_camembert.py b/server/transformers/src/transformers/modeling_tf_camembert.py
deleted file mode 100644
index d6317cacfb5fc0fb2f05d99f33c7e2871fda2a2c..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_camembert.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 RoBERTa model. """
-
-
-import logging
-
-from .configuration_camembert import CamembertConfig
-from .file_utils import add_start_docstrings
-from .modeling_tf_roberta import (
-    TFRobertaForMaskedLM,
-    TFRobertaForSequenceClassification,
-    TFRobertaForTokenClassification,
-    TFRobertaModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {}
-
-
-CAMEMBERT_START_DOCSTRING = r"""
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
-    CAMEMBERT_START_DOCSTRING,
-)
-class TFCamembertModel(TFRobertaModel):
-    """
-    This class overrides :class:`~transformers.TFRobertaModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
-)
-class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
-    """
-    This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    CAMEMBERT_START_DOCSTRING,
-)
-class TFCamembertForSequenceClassification(TFRobertaForSequenceClassification):
-    """
-    This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """CamemBERT Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    CAMEMBERT_START_DOCSTRING,
-)
-class TFCamembertForTokenClassification(TFRobertaForTokenClassification):
-    """
-    This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/server/transformers/src/transformers/modeling_tf_ctrl.py b/server/transformers/src/transformers/modeling_tf_ctrl.py
deleted file mode 100644
index 78e0c1113a8b9a796513711efa0ab682a4cd97e6..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_ctrl.py
+++ /dev/null
@@ -1,551 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 CTRL model."""
-
-
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"}
-
-
-def angle_defn(pos, i, d_model_size):
-    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size))
-    return pos * angle_rates
-
-
-def positional_encoding(position, d_model_size):
-    # create the sinusoidal pattern for the positional encoding
-    angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
-
-    sines = np.sin(angle_rads[:, 0::2])
-    cosines = np.cos(angle_rads[:, 1::2])
-
-    # pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...], dtype=tf.float32)
-    pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32)
-    return pos_encoding
-
-
-def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
-    # calculate attention
-    matmul_qk = tf.matmul(q, k, transpose_b=True)
-
-    dk = tf.cast(shape_list(k)[-1], tf.float32)
-    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
-
-    if mask is not None:
-        scaled_attention_logits += mask * -1e4
-
-    if attention_mask is not None:
-        # Apply the attention mask
-        scaled_attention_logits = scaled_attention_logits + attention_mask
-
-    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
-
-    # Mask heads if we want to
-    if head_mask is not None:
-        attention_weights = attention_weights * head_mask
-
-    output = tf.matmul(attention_weights, v)
-
-    return output, attention_weights
-
-
-class TFMultiHeadAttention(tf.keras.layers.Layer):
-    def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = output_attentions
-        self.num_heads = num_heads
-        self.d_model_size = d_model_size
-
-        self.depth = int(d_model_size / self.num_heads)
-
-        self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
-        self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
-        self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")
-
-        self.dense = tf.keras.layers.Dense(d_model_size, name="dense")
-
-    def split_into_heads(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, inputs, training=False):
-        v, k, q, mask, layer_past, attention_mask, head_mask = inputs
-        batch_size = shape_list(q)[0]
-
-        q = self.Wq(q)
-        k = self.Wk(k)
-        v = self.Wv(v)
-
-        q = self.split_into_heads(q, batch_size)
-        k = self.split_into_heads(k, batch_size)
-        v = self.split_into_heads(v, batch_size)
-        if layer_past is not None:
-            past_key, past_value = tf.unstack(layer_past, axis=1)
-            k = tf.concat((past_key, k), dim=-2)
-            v = tf.concat((past_value, v), dim=-2)
-        present = tf.stack((k, v), axis=1)
-
-        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
-        scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
-        attn = output[1]
-        original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
-        output = self.dense(original_size_attention)
-
-        outputs = (output, present)
-        if self.output_attentions:
-            outputs = outputs + (attn,)
-        return outputs
-
-
-def point_wise_feed_forward_network(d_model_size, dff, name=""):
-    return tf.keras.Sequential(
-        [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
-        name="ffn",
-    )
-
-
-class TFEncoderLayer(tf.keras.layers.Layer):
-    def __init__(
-        self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.multi_head_attention = TFMultiHeadAttention(
-            d_model_size, num_heads, output_attentions, name="multi_head_attention"
-        )
-        self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")
-
-        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
-        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
-
-        self.dropout1 = tf.keras.layers.Dropout(rate)
-        self.dropout2 = tf.keras.layers.Dropout(rate)
-
-    def call(self, inputs, training=False):
-        x, mask, layer_past, attention_mask, head_mask = inputs
-        normed = self.layernorm1(x)
-        attn_outputs = self.multi_head_attention(
-            [normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training
-        )
-        attn_output = attn_outputs[0]
-        attn_output = self.dropout1(attn_output, training=training)
-        out1 = x + attn_output
-
-        out2 = self.layernorm2(out1)
-        ffn_output = self.ffn(out2)
-        ffn_output = self.dropout2(ffn_output, training=training)
-        out2 = out1 + ffn_output
-
-        outputs = (out2,) + attn_outputs[1:]
-        return outputs
-
-
-class TFCTRLMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.output_past = config.output_past
-
-        self.d_model_size = config.n_embd
-        self.num_layers = config.n_layer
-
-        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
-
-        self.w = TFSharedEmbeddings(
-            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w"
-        )
-
-        self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [
-            TFEncoderLayer(
-                config.n_embd,
-                config.n_head,
-                config.dff,
-                config.resid_pdrop,
-                config.layer_norm_epsilon,
-                config.output_attentions,
-                name="h_._{}".format(i),
-            )
-            for i in range(config.n_layer)
-        ]
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
-
-    def get_input_embeddings(self):
-        return self.w
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-                heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        raise NotImplementedError
-
-    def call(
-        self,
-        inputs,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            past = inputs[1] if len(inputs) > 1 else past
-            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
-            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
-            position_ids = inputs[4] if len(inputs) > 4 else position_ids
-            head_mask = inputs[5] if len(inputs) > 5 else head_mask
-            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
-            assert len(inputs) <= 7, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            past = inputs.get("past", past)
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 7, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            past_length = shape_list(past[0][0])[-2]
-        if position_ids is None:
-            position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
-            position_ids = tf.tile(position_ids, [input_shape[0], 1])
-
-        # Attention mask.
-        if attention_mask is not None:
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-
-            attention_mask = tf.cast(attention_mask, tf.float32)
-            attention_mask = (1.0 - attention_mask) * -10000.0
-        else:
-            attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_layers
-
-        if token_type_ids is not None:
-            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.w(token_type_ids, mode="embedding")
-            token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
-        else:
-            token_type_embeds = 0
-        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
-
-        if inputs_embeds is None:
-            inputs_embeds = self.w(input_ids, mode="embedding")
-        seq_len = input_shape[-1]
-        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-
-        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
-
-        pos_embeds = tf.gather(self.pos_encoding, position_ids)
-
-        hidden_states = inputs_embeds + pos_embeds + token_type_embeds
-
-        hidden_states = self.dropout(hidden_states, training=training)
-
-        output_shape = input_shape + [shape_list(hidden_states)[-1]]
-        presents = ()
-        all_hidden_states = ()
-        all_attentions = []
-        for i, (h, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
-            outputs = h([hidden_states, mask, layer_past, attention_mask, head_mask[i]], training=training)
-            hidden_states, present = outputs[:2]
-
-            if self.output_past:
-                presents = presents + (present,)
-
-            if self.output_attentions:
-                all_attentions.append(outputs[2])
-
-        hidden_states = self.layernorm(hidden_states)
-        hidden_states = tf.reshape(hidden_states, output_shape)
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_past:
-            outputs = outputs + (presents,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
-            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs
-
-
-class TFCTRLPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = CTRLConfig
-    pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-CTRL_START_DOCSTRING = r"""
-
-    .. note::
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-CTRL_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
-    CTRL_START_DOCSTRING,
-)
-class TFCTRLModel(TFCTRLPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFCTRLMainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLModel.from_pretrained('ctrl')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-class TFCTRLLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.input_embeddings = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-@add_start_docstrings(
-    """The CTRL Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    CTRL_START_DOCSTRING,
-)
-class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFCTRLMainLayer(config, name="transformer")
-
-        self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
-
-    def get_output_embeddings(self):
-        return self.lm_head.input_embeddings
-
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
-        outputs = model(input_ids)
-        loss, logits = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-
-        return outputs  # lm_logits, presents, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_distilbert.py b/server/transformers/src/transformers/modeling_tf_distilbert.py
deleted file mode 100644
index 1dc8301730e8141e47d3883d8f843625b676cdd5..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_distilbert.py
+++ /dev/null
@@ -1,838 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 DistilBERT model
-"""
-
-
-import logging
-import math
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_distilbert import DistilBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-
-TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
-    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
-    "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
-    "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
-}
-
-
-# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
-def gelu(x):
-    """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
-    return x * cdf
-
-
-def gelu_new(x):
-    """Gaussian Error Linear Unit.
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-    Args:
-        x: float Tensor to perform activation.
-    Returns:
-        `x` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
-
-
-class TFEmbeddings(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        self.dim = config.dim
-        self.initializer_range = config.initializer_range
-        self.word_embeddings = TFSharedEmbeddings(
-            config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
-        )  # padding_idx=0)
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            config.dim,
-            embeddings_initializer=get_initializer(config.initializer_range),
-            name="position_embeddings",
-        )
-        if config.sinusoidal_pos_embds:
-            raise NotImplementedError
-
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        with tf.name_scope("word_embeddings"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.word_embeddings = self.add_weight(
-                "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
-            )
-        super().build(input_shape)
-
-    def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
-        """Get token embeddings of inputs.
-        Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
-            mode: string, a valid value is one of "embedding" and "linear".
-        Returns:
-            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
-                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
-                linear tensor, float32 with shape [batch_size, length, vocab_size].
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training)
-        elif mode == "linear":
-            return self._linear(inputs)
-        else:
-            raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, inputs, inputs_embeds=None, training=False):
-        """
-        Parameters
-        ----------
-        input_ids: tf.Tensor(bs, max_seq_length)
-            The token ids to embed.
-
-        Outputs
-        -------
-        embeddings: tf.Tensor(bs, max_seq_length, dim)
-            The embedded tokens (plus position embeddings, no token_type embeddings)
-        """
-        if not isinstance(inputs, (tuple, list)):
-            input_ids = inputs
-            position_ids = None
-        else:
-            input_ids, position_ids = inputs
-
-        if input_ids is not None:
-            seq_length = shape_list(input_ids)[1]
-        else:
-            seq_length = shape_list(inputs_embeds)[1]
-
-        if position_ids is None:
-            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
-
-        if inputs_embeds is None:
-            inputs_embeds = tf.gather(self.word_embeddings, input_ids)
-        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
-
-        embeddings = inputs_embeds + position_embeddings  # (bs, max_seq_length, dim)
-        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
-        embeddings = self.dropout(embeddings, training=training)  # (bs, max_seq_length, dim)
-        return embeddings
-
-    def _linear(self, inputs):
-        """Computes logits by running inputs through a linear layer.
-            Args:
-                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-            Returns:
-                float32 tensor with shape [batch_size, length, vocab_size].
-        """
-        batch_size = shape_list(inputs)[0]
-        length = shape_list(inputs)[1]
-
-        x = tf.reshape(inputs, [-1, self.dim])
-        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
-
-        return tf.reshape(logits, [batch_size, length, self.vocab_size])
-
-
-class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.n_heads = config.n_heads
-        self.dim = config.dim
-        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
-        self.output_attentions = config.output_attentions
-
-        assert self.dim % self.n_heads == 0
-
-        self.q_lin = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
-        )
-        self.k_lin = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
-        )
-        self.v_lin = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
-        )
-        self.out_lin = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
-        )
-
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    def call(self, inputs, training=False):
-        """
-        Parameters
-        ----------
-        query: tf.Tensor(bs, seq_length, dim)
-        key: tf.Tensor(bs, seq_length, dim)
-        value: tf.Tensor(bs, seq_length, dim)
-        mask: tf.Tensor(bs, seq_length)
-
-        Outputs
-        -------
-        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
-            Attention weights
-        context: tf.Tensor(bs, seq_length, dim)
-            Contextualized layer. Optional: only if `output_attentions=True`
-        """
-        query, key, value, mask, head_mask = inputs
-        bs, q_length, dim = shape_list(query)
-        k_length = shape_list(key)[1]
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        # assert key.size() == value.size()
-
-        dim_per_head = self.dim // self.n_heads
-
-        mask_reshape = [bs, 1, 1, k_length]
-
-        def shape(x):
-            """ separate heads """
-            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
-
-        def unshape(x):
-            """ group heads """
-            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
-
-        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
-        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
-        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
-
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
-        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, q_length, k_length)
-        mask = tf.reshape(mask, mask_reshape)  # (bs, n_heads, qlen, klen)
-        # scores.masked_fill_(mask, -float('inf'))            # (bs, n_heads, q_length, k_length)
-        scores = scores - 1e30 * (1.0 - mask)
-
-        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)  # (bs, q_length, dim)
-        context = self.out_lin(context)  # (bs, q_length, dim)
-
-        if self.output_attentions:
-            return (context, weights)
-        else:
-            return (context,)
-
-
-class TFFFN(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.lin1 = tf.keras.layers.Dense(
-            config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
-        )
-        self.lin2 = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
-        )
-        assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
-            config.activation
-        )
-        self.activation = (
-            tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu
-        )
-
-    def call(self, input, training=False):
-        x = self.lin1(input)
-        x = self.activation(x)
-        x = self.lin2(x)
-        x = self.dropout(x, training=training)
-        return x
-
-
-class TFTransformerBlock(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-        self.n_heads = config.n_heads
-        self.dim = config.dim
-        self.hidden_dim = config.hidden_dim
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.activation = config.activation
-        self.output_attentions = config.output_attentions
-
-        assert config.dim % config.n_heads == 0
-
-        self.attention = TFMultiHeadSelfAttention(config, name="attention")
-        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
-
-        self.ffn = TFFFN(config, name="ffn")
-        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
-
-    def call(self, inputs, training=False):  # removed: src_enc=None, src_len=None
-        """
-        Parameters
-        ----------
-        x: tf.Tensor(bs, seq_length, dim)
-        attn_mask: tf.Tensor(bs, seq_length)
-
-        Outputs
-        -------
-        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
-            The attention weights
-        ffn_output: tf.Tensor(bs, seq_length, dim)
-            The output of the transformer block contextualization.
-        """
-        x, attn_mask, head_mask = inputs
-
-        # Self-Attention
-        sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training)
-        if self.output_attentions:
-            sa_output, sa_weights = sa_output  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
-        else:  # To handle these `output_attention` or `output_hidden_states` cases returning tuples
-            # assert type(sa_output) == tuple
-            sa_output = sa_output[0]
-        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)
-
-        # Feed Forward Network
-        ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
-        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
-
-        output = (ffn_output,)
-        if self.output_attentions:
-            output = (sa_weights,) + output
-        return output
-
-
-class TFTransformer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.n_layers = config.n_layers
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
-
-    def call(self, inputs, training=False):
-        """
-        Parameters
-        ----------
-        x: tf.Tensor(bs, seq_length, dim)
-            Input sequence embedded.
-        attn_mask: tf.Tensor(bs, seq_length)
-            Attention mask on the sequence.
-
-        Outputs
-        -------
-        hidden_state: tf.Tensor(bs, seq_length, dim)
-            Sequence of hiddens states in the last (top) layer
-        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
-            Tuple of length n_layers with the hidden states from each layer.
-            Optional: only if output_hidden_states=True
-        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
-            Tuple of length n_layers with the attention weights from each layer
-            Optional: only if output_attentions=True
-        """
-        x, attn_mask, head_mask = inputs
-
-        all_hidden_states = ()
-        all_attentions = ()
-
-        hidden_state = x
-        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_state,)
-
-            layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training)
-            hidden_state = layer_outputs[-1]
-
-            if self.output_attentions:
-                assert len(layer_outputs) == 2
-                attentions = layer_outputs[0]
-                all_attentions = all_attentions + (attentions,)
-            else:
-                assert len(layer_outputs) == 1
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_state,)
-
-        outputs = (hidden_state,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-class TFDistilBertMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.num_hidden_layers = config.num_hidden_layers
-
-        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
-        self.transformer = TFTransformer(config, name="transformer")  # Encoder
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        raise NotImplementedError
-
-    def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
-            assert len(inputs) <= 4, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 4, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if attention_mask is None:
-            attention_mask = tf.ones(input_shape)  # (bs, seq_length)
-        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-
-        embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
-        tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
-
-        return tfmr_output  # last-layer hidden-state, (all hidden_states), (all attentions)
-
-
-# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
-class TFDistilBertPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = DistilBertConfig
-    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "distilbert"
-
-
-DISTILBERT_START_DOCSTRING = r"""
-    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
-    Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-DISTILBERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-
-"""
-
-
-@add_start_docstrings(
-    "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
-    DISTILBERT_START_DOCSTRING,
-)
-class TFDistilBertModel(TFDistilBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertModel
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-        """
-        outputs = self.distilbert(inputs, **kwargs)
-        return outputs
-
-
-class TFDistilBertLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.input_embeddings = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-@add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
-)
-class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.vocab_size = config.vocab_size
-
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.vocab_transform = tf.keras.layers.Dense(
-            config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
-        )
-        self.act = tf.keras.layers.Activation(gelu)
-        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
-        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
-
-    def get_output_embeddings(self):
-        return self.vocab_projector.input_embeddings
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
-        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
-        """
-        distilbert_output = self.distilbert(inputs, **kwargs)
-
-        hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
-        prediction_logits = self.act(prediction_logits)  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
-        prediction_logits = self.vocab_projector(prediction_logits)
-
-        outputs = (prediction_logits,) + distilbert_output[1:]
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    DISTILBERT_START_DOCSTRING,
-)
-class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.pre_classifier = tf.keras.layers.Dense(
-            config.dim,
-            kernel_initializer=get_initializer(config.initializer_range),
-            activation="relu",
-            name="pre_classifier",
-        )
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
-        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        distilbert_output = self.distilbert(inputs, **kwargs)
-
-        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
-        pooled_output = hidden_state[:, 0]  # (bs, dim)
-        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
-        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))  # (bs, dim)
-        logits = self.classifier(pooled_output)  # (bs, dim)
-
-        outputs = (logits,) + distilbert_output[1:]
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    DISTILBERT_START_DOCSTRING,
-)
-class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
-        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-        """
-        outputs = self.distilbert(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    DISTILBERT_START_DOCSTRING,
-)
-class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.qa_outputs = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
-        )
-        assert config.num_labels == 2
-        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
-
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
-        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-        """
-        distilbert_output = self.distilbert(inputs, **kwargs)
-
-        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
-        hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False))  # (bs, max_query_len, dim)
-        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + distilbert_output[1:]
-        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_gpt2.py b/server/transformers/src/transformers/modeling_tf_gpt2.py
deleted file mode 100644
index 11566609533684885c9c78c6738930ebc55ebfd7..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_gpt2.py
+++ /dev/null
@@ -1,694 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 OpenAI GPT-2 model. """
-
-
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import (
-    TFConv1D,
-    TFPreTrainedModel,
-    TFSequenceSummary,
-    TFSharedEmbeddings,
-    get_initializer,
-    shape_list,
-)
-
-
-logger = logging.getLogger(__name__)
-
-TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
-    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
-    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
-    "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",
-}
-
-
-def gelu(x):
-    """Gaussian Error Linear Unit.
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-    Args:
-        x: float Tensor to perform activation.
-    Returns:
-        `x` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
-
-
-class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        self.n_ctx = n_ctx
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
-        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
-        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
-        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        pass
-
-    @staticmethod
-    def causal_attention_mask(nd, ns, dtype):
-        """1's in the lower triangle, counting from the lower right corner.
-        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
-        """
-        i = tf.range(nd)[:, None]
-        j = tf.range(ns)
-        m = i >= j - ns + nd
-        return tf.cast(m, dtype)
-
-    def _attn(self, inputs, training=False):
-        q, k, v, attention_mask, head_mask = inputs
-        # q, k, v have shape [batch, heads, sequence, features]
-        w = tf.matmul(q, k, transpose_b=True)
-        if self.scale:
-            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention_scores
-            w = w / tf.math.sqrt(dk)
-
-        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
-        _, _, nd, ns = shape_list(w)
-        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
-        b = tf.reshape(b, [1, 1, nd, ns])
-        w = w * b - 1e4 * (1 - b)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            w = w + attention_mask
-
-        w = tf.nn.softmax(w, axis=-1)
-        w = self.attn_dropout(w, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        outputs = [tf.matmul(w, v)]
-        if self.output_attentions:
-            outputs.append(w)
-        return outputs
-
-    def merge_heads(self, x):
-        x = tf.transpose(x, [0, 2, 1, 3])
-        x_shape = shape_list(x)
-        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
-        return tf.reshape(x, new_x_shape)
-
-    def split_heads(self, x):
-        x_shape = shape_list(x)
-        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
-        x = tf.reshape(x, new_x_shape)
-        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
-
-    def call(self, inputs, training=False):
-        x, layer_past, attention_mask, head_mask = inputs
-
-        x = self.c_attn(x)
-        query, key, value = tf.split(x, 3, axis=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key)
-        value = self.split_heads(value)
-        if layer_past is not None:
-            past_key, past_value = tf.unstack(layer_past, axis=1)
-            key = tf.concat([past_key, key], axis=-2)
-            value = tf.concat([past_value, value], axis=-2)
-        present = tf.stack([key, value], axis=1)
-
-        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
-        a = attn_outputs[0]
-
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a, training=training)
-
-        outputs = [a, present] + attn_outputs[1:]
-        return outputs  # a, present, (attentions)
-
-
-class TFMLP(tf.keras.layers.Layer):
-    def __init__(self, n_state, config, **kwargs):
-        super().__init__(**kwargs)
-        nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
-        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
-        self.act = gelu
-        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
-
-    def call(self, x, training=False):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        h2 = self.dropout(h2, training=training)
-        return h2
-
-
-class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super().__init__(**kwargs)
-        nx = config.n_embd
-        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
-        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
-        self.mlp = TFMLP(4 * nx, config, name="mlp")
-
-    def call(self, inputs, training=False):
-        x, layer_past, attention_mask, head_mask = inputs
-
-        a = self.ln_1(x)
-        output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training)
-        a = output_attn[0]  # output_attn: a, present, (attentions)
-        x = x + a
-
-        m = self.ln_2(x)
-        m = self.mlp(m, training=training)
-        x = x + m
-
-        outputs = [x] + output_attn[1:]
-        return outputs  # x, present, (attentions)
-
-
-class TFGPT2MainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.num_hidden_layers = config.n_layer
-        self.vocab_size = config.vocab_size
-        self.n_embd = config.n_embd
-
-        self.wte = TFSharedEmbeddings(
-            config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
-        )
-        self.wpe = tf.keras.layers.Embedding(
-            config.n_positions,
-            config.n_embd,
-            embeddings_initializer=get_initializer(config.initializer_range),
-            name="wpe",
-        )
-        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
-        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        raise NotImplementedError
-
-    def call(
-        self,
-        inputs,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            past = inputs[1] if len(inputs) > 1 else past
-            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
-            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
-            position_ids = inputs[4] if len(inputs) > 4 else position_ids
-            head_mask = inputs[5] if len(inputs) > 5 else head_mask
-            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
-            assert len(inputs) <= 7, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            past = inputs.get("past", past)
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 7, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            past_length = shape_list(past[0][0])[-2]
-        if position_ids is None:
-            position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
-
-        if attention_mask is not None:
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-
-            attention_mask = tf.cast(attention_mask, tf.float32)
-            attention_mask = (1.0 - attention_mask) * -10000.0
-        else:
-            attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids, mode="embedding")
-        position_embeds = self.wpe(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.wte(token_type_ids, mode="embedding")
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states, training=training)
-
-        output_shape = input_shape + [shape_list(hidden_states)[-1]]
-
-        presents = ()
-        all_attentions = []
-        all_hidden_states = ()
-        for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
-
-            outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
-
-            hidden_states, present = outputs[:2]
-            presents = presents + (present,)
-
-            if self.output_attentions:
-                all_attentions.append(outputs[2])
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = tf.reshape(hidden_states, output_shape)
-        # Add last hidden state
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states, presents)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
-            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
-
-
-class TFGPT2PreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = GPT2Config
-    pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-GPT2_START_DOCSTRING = r"""
-
-    .. note::
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-GPT2_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
-    GPT2_START_DOCSTRING,
-)
-class TFGPT2Model(TFGPT2PreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFGPT2MainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2Model
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2Model.from_pretrained('gpt2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-@add_start_docstrings(
-    """The GPT2 Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    GPT2_START_DOCSTRING,
-)
-class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFGPT2MainLayer(config, name="transformer")
-
-    def get_output_embeddings(self):
-        return self.transformer.wte
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.transformer.wte(hidden_states, mode="linear")
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-
-        return outputs  # lm_logits, presents, (all hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
-    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
-    The language modeling head has its weights tied to the input embeddings,
-    the classification head takes as input the input of a specified classification token index in the input sequence).
-""",
-    GPT2_START_DOCSTRING,
-)
-class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        config.num_labels = 1
-        self.transformer = TFGPT2MainLayer(config, name="transformer")
-        self.multiple_choice_head = TFSequenceSummary(
-            config, initializer_range=config.initializer_range, name="multiple_choice_head"
-        )
-
-    def get_output_embeddings(self):
-        return self.transformer.wte
-
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    def call(
-        self,
-        inputs,
-        past=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        mc_token_ids=None,
-        training=False,
-    ):
-        r"""
-        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
-
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-        lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
-            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-
-    Examples::
-
-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
-
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
-
-        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
-        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
-
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
-
-        """
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            past = inputs[1] if len(inputs) > 1 else past
-            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
-            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
-            position_ids = inputs[4] if len(inputs) > 4 else position_ids
-            head_mask = inputs[5] if len(inputs) > 5 else head_mask
-            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
-            mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
-            assert len(inputs) <= 8, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            past = inputs.get("past", past)
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
-            assert len(inputs) <= 8, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None:
-            input_shapes = shape_list(input_ids)
-        else:
-            input_shapes = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shapes[-1]
-
-        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
-        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
-        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
-        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
-        flat_inputs = [
-            flat_input_ids,
-            past,
-            flat_attention_mask,
-            flat_token_type_ids,
-            flat_position_ids,
-            head_mask,
-            inputs_embeds,
-        ]
-
-        transformer_outputs = self.transformer(flat_inputs, training=training)
-        hidden_states = transformer_outputs[0]
-
-        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
-
-        lm_logits = self.transformer.wte(hidden_states, mode="linear")
-        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
-
-        mc_logits = tf.squeeze(mc_logits, axis=-1)
-
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
-
-        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_openai.py b/server/transformers/src/transformers/modeling_tf_openai.py
deleted file mode 100644
index f04104db8352dfbd4f189572554b5a6c1cfa6b50..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_openai.py
+++ /dev/null
@@ -1,661 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 OpenAI GPT model."""
-
-
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import (
-    TFConv1D,
-    TFPreTrainedModel,
-    TFSequenceSummary,
-    TFSharedEmbeddings,
-    get_initializer,
-    shape_list,
-)
-
-
-logger = logging.getLogger(__name__)
-
-TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"
-}
-
-
-def gelu(x):
-    """Gaussian Error Linear Unit.
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-    Args:
-        x: float Tensor to perform activation.
-    Returns:
-        `x` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
-
-
-def swish(x):
-    return x * tf.math.sigmoid(x)
-
-
-ACT_FNS = {
-    "gelu": tf.keras.layers.Activation(gelu),
-    "relu": tf.keras.activations.relu,
-    "swish": tf.keras.layers.Activation(swish),
-}
-
-
-class TFAttention(tf.keras.layers.Layer):
-    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        self.n_ctx = n_ctx
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
-        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
-        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
-        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        pass
-
-    @staticmethod
-    def causal_attention_mask(nd, ns, dtype):
-        """1's in the lower triangle, counting from the lower right corner.
-        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
-        """
-        i = tf.range(nd)[:, None]
-        j = tf.range(ns)
-        m = i >= j - ns + nd
-        return tf.cast(m, dtype)
-
-    def _attn(self, inputs, training=False):
-        q, k, v, attention_mask, head_mask = inputs
-        # q, k, v have shape [batch, heads, sequence, features]
-        w = tf.matmul(q, k, transpose_b=True)
-        if self.scale:
-            dk = tf.cast(shape_list(k)[-1], tf.float32)  # scale attention_scores
-            w = w / tf.math.sqrt(dk)
-
-        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
-        _, _, nd, ns = shape_list(w)
-        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
-        b = tf.reshape(b, [1, 1, nd, ns])
-        w = w * b - 1e4 * (1 - b)
-
-        if attention_mask is not None:
-            # Apply the attention mask
-            w = w + attention_mask
-
-        w = tf.nn.softmax(w, axis=-1)
-        w = self.attn_dropout(w, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        outputs = [tf.matmul(w, v)]
-        if self.output_attentions:
-            outputs.append(w)
-        return outputs
-
-    def merge_heads(self, x):
-        x = tf.transpose(x, [0, 2, 1, 3])
-        x_shape = shape_list(x)
-        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
-        return tf.reshape(x, new_x_shape)
-
-    def split_heads(self, x):
-        x_shape = shape_list(x)
-        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
-        x = tf.reshape(x, new_x_shape)
-        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
-
-    def call(self, inputs, training=False):
-        x, attention_mask, head_mask = inputs
-
-        x = self.c_attn(x)
-        query, key, value = tf.split(x, 3, axis=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key)
-        value = self.split_heads(value)
-
-        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
-        a = attn_outputs[0]
-
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a, training=training)
-
-        outputs = [a] + attn_outputs[1:]
-        return outputs  # a, (attentions)
-
-
-class TFMLP(tf.keras.layers.Layer):
-    def __init__(self, n_state, config, **kwargs):
-        super().__init__(**kwargs)
-        nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
-        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
-        self.act = gelu
-        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
-
-    def call(self, x, training=False):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        h2 = self.dropout(h2, training=training)
-        return h2
-
-
-class TFBlock(tf.keras.layers.Layer):
-    def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super().__init__(**kwargs)
-        nx = config.n_embd
-        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
-        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
-        self.mlp = TFMLP(4 * nx, config, name="mlp")
-        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
-
-    def call(self, inputs, training=False):
-        x, attention_mask, head_mask = inputs
-
-        output_attn = self.attn([x, attention_mask, head_mask], training=training)
-        a = output_attn[0]  # output_attn: a, (attentions)
-
-        n = self.ln_1(x + a)
-        m = self.mlp(n, training=training)
-        h = self.ln_2(n + m)
-
-        outputs = [h] + output_attn[1:]
-        return outputs  # x, (attentions)
-
-
-class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.num_hidden_layers = config.n_layer
-        self.vocab_size = config.vocab_size
-        self.n_embd = config.n_embd
-
-        self.tokens_embed = TFSharedEmbeddings(
-            config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
-        )
-        self.positions_embed = tf.keras.layers.Embedding(
-            config.n_positions,
-            config.n_embd,
-            embeddings_initializer=get_initializer(config.initializer_range),
-            name="positions_embed",
-        )
-        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
-
-    def get_input_embeddings(self):
-        return self.tokens_embed
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        raise NotImplementedError
-
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
-            assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 6, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = shape_list(input_ids)
-            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
-        elif inputs_embeds is not None:
-            input_shape = shape_list(inputs_embeds)[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if position_ids is None:
-            position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :]
-
-        if attention_mask is not None:
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-
-            attention_mask = tf.cast(attention_mask, tf.float32)
-            attention_mask = (1.0 - attention_mask) * -10000.0
-        else:
-            attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
-
-        if inputs_embeds is None:
-            inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
-        position_embeds = self.positions_embed(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
-            token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states, training=training)
-
-        output_shape = input_shape + [shape_list(hidden_states)[-1]]
-
-        all_attentions = []
-        all_hidden_states = ()
-        for i, block in enumerate(self.h):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
-
-            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
-            hidden_states = outputs[0]
-            if self.output_attentions:
-                all_attentions.append(outputs[1])
-
-        hidden_states = tf.reshape(hidden_states, output_shape)
-        # Add last hidden state
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
-            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, (all hidden_states), (attentions)
-
-
-class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = OpenAIGPTConfig
-    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-OPENAI_GPT_START_DOCSTRING = r"""
-
-    .. note::
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-
-    Parameters:
-        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-OPENAI_GPT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
-    OPENAI_GPT_START_DOCSTRING,
-)
-class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-@add_start_docstrings(
-    """OpenAI GPT Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    OPENAI_GPT_START_DOCSTRING,
-)
-class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
-
-    def get_output_embeddings(self):
-        return self.transformer.tokens_embed
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-
-        return outputs  # lm_logits, (all hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
-    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
-    The language modeling head has its weights tied to the input embeddings,
-    the classification head takes as input the input of a specified classification token index in the input sequence).
-""",
-    OPENAI_GPT_START_DOCSTRING,
-)
-class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        config.num_labels = 1
-        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
-        self.multiple_choice_head = TFSequenceSummary(
-            config, initializer_range=config.initializer_range, name="multiple_choice_head"
-        )
-
-    def get_output_embeddings(self):
-        return self.transformer.tokens_embed
-
-    @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        mc_token_ids=None,
-        training=False,
-    ):
-        r"""
-        mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
-            Index of the classification token in each input sequence.
-            Selected in the range ``[0, input_ids.size(-1) - 1[``.
-
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
-            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-
-    Examples::
-
-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
-
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
-        mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :]  # Batch size 1
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
-
-        """
-
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
-            mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
-            assert len(inputs) <= 7, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
-            assert len(inputs) <= 7, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None:
-            input_shapes = shape_list(input_ids)
-        else:
-            input_shapes = shape_list(inputs_embeds)[:-1]
-
-        seq_length = input_shapes[-1]
-
-        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
-        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
-        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
-        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
-        flat_inputs = [
-            flat_input_ids,
-            flat_attention_mask,
-            flat_token_type_ids,
-            flat_position_ids,
-            head_mask,
-            inputs_embeds,
-        ]
-
-        transformer_outputs = self.transformer(flat_inputs, training=training)
-        hidden_states = transformer_outputs[0]
-
-        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
-
-        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
-        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
-
-        mc_logits = tf.squeeze(mc_logits, axis=-1)
-
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
-
-        return outputs  # lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_pytorch_utils.py b/server/transformers/src/transformers/modeling_tf_pytorch_utils.py
deleted file mode 100644
index 81290326c9beb0af3fd98f2bdd52b65974d13cd3..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_pytorch_utils.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch - TF 2.0 general utilities."""
-
-
-import logging
-import os
-import re
-
-import numpy
-
-
-logger = logging.getLogger(__name__)
-
-
-def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""):
-    """ Convert a TF 2.0 model variable name in a pytorch model weight name.
-
-        Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
-            - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-            - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
-
-        return tuple with:
-            - pytorch model weight name
-            - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other
-    """
-    tf_name = tf_name.replace(":0", "")  # device ids
-    tf_name = re.sub(
-        r"/[^/]*___([^/]*)/", r"/\1/", tf_name
-    )  # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
-    tf_name = tf_name.replace(
-        "_._", "/"
-    )  # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
-    tf_name = re.sub(r"//+", "/", tf_name)  # Remove empty levels at the end
-    tf_name = tf_name.split("/")  # Convert from TF2.0 '/' separators to PyTorch '.' separators
-    tf_name = tf_name[1:]  # Remove level zero
-
-    # When should we transpose the weights
-    transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name)
-
-    # Convert standard TF2.0 names in PyTorch names
-    if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
-        tf_name[-1] = "weight"
-    if tf_name[-1] == "beta":
-        tf_name[-1] = "bias"
-
-    # Remove prefix if needed
-    tf_name = ".".join(tf_name)
-    if start_prefix_to_remove:
-        tf_name = tf_name.replace(start_prefix_to_remove, "", 1)
-
-    return tf_name, transpose
-
-
-#####################
-# PyTorch => TF 2.0 #
-#####################
-
-
-def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch checkpoints in a TF 2.0 model
-    """
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    pt_path = os.path.abspath(pytorch_checkpoint_path)
-    logger.info("Loading PyTorch weights from {}".format(pt_path))
-
-    pt_state_dict = torch.load(pt_path, map_location="cpu")
-    logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
-
-    return load_pytorch_weights_in_tf2_model(
-        tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
-    )
-
-
-def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch checkpoints in a TF 2.0 model
-    """
-    pt_state_dict = pt_model.state_dict()
-
-    return load_pytorch_weights_in_tf2_model(
-        tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys
-    )
-
-
-def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
-    """ Load pytorch state_dict in a TF 2.0 model.
-    """
-    try:
-        import torch  # noqa: F401
-        import tensorflow as tf  # noqa: F401
-        from tensorflow.python.keras import backend as K
-    except ImportError:
-        logger.error(
-            "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    if tf_inputs is None:
-        tf_inputs = tf_model.dummy_inputs
-
-    if tf_inputs is not None:
-        tf_model(tf_inputs, training=False)  # Make sure model is built
-
-    # Adapt state dict - TODO remove this and update the AWS weights files instead
-    # Convert old format to new format if needed from a PyTorch state_dict
-    old_keys = []
-    new_keys = []
-    for key in pt_state_dict.keys():
-        new_key = None
-        if "gamma" in key:
-            new_key = key.replace("gamma", "weight")
-        if "beta" in key:
-            new_key = key.replace("beta", "bias")
-        if new_key:
-            old_keys.append(key)
-            new_keys.append(new_key)
-    for old_key, new_key in zip(old_keys, new_keys):
-        pt_state_dict[new_key] = pt_state_dict.pop(old_key)
-
-    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
-    # TF models always have a prefix, some of PyTorch models (base ones) don't
-    start_prefix_to_remove = ""
-    if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()):
-        start_prefix_to_remove = tf_model.base_model_prefix + "."
-
-    symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
-    tf_loaded_numel = 0
-    weight_value_tuples = []
-    all_pytorch_weights = set(list(pt_state_dict.keys()))
-    for symbolic_weight in symbolic_weights:
-        sw_name = symbolic_weight.name
-        name, transpose = convert_tf_weight_name_to_pt_weight_name(
-            sw_name, start_prefix_to_remove=start_prefix_to_remove
-        )
-
-        # Find associated numpy array in pytorch model state dict
-        if name not in pt_state_dict:
-            if allow_missing_keys:
-                continue
-            raise AttributeError("{} not found in PyTorch model".format(name))
-
-        array = pt_state_dict[name].numpy()
-
-        if transpose:
-            array = numpy.transpose(array)
-
-        if len(symbolic_weight.shape) < len(array.shape):
-            array = numpy.squeeze(array)
-        elif len(symbolic_weight.shape) > len(array.shape):
-            array = numpy.expand_dims(array, axis=0)
-
-        try:
-            assert list(symbolic_weight.shape) == list(array.shape)
-        except AssertionError as e:
-            e.args += (symbolic_weight.shape, array.shape)
-            raise e
-
-        tf_loaded_numel += array.size
-        # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
-
-        weight_value_tuples.append((symbolic_weight, array))
-        all_pytorch_weights.discard(name)
-
-    K.batch_set_value(weight_value_tuples)
-
-    if tf_inputs is not None:
-        tf_model(tf_inputs, training=False)  # Make sure restore ops are run
-
-    logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
-
-    logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
-
-    return tf_model
-
-
-#####################
-# TF 2.0 => PyTorch #
-#####################
-
-
-def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
-    """ Load TF 2.0 HDF5 checkpoint in a PyTorch model
-        We use HDF5 to easily do transfer learning
-        (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
-    """
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    import transformers
-
-    logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
-
-    # Instantiate and load the associated TF 2.0 model
-    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beggining
-    tf_model_class = getattr(transformers, tf_model_class_name)
-    tf_model = tf_model_class(pt_model.config)
-
-    if tf_inputs is None:
-        tf_inputs = tf_model.dummy_inputs
-
-    if tf_inputs is not None:
-        tf_model(tf_inputs, training=False)  # Make sure model is built
-
-    tf_model.load_weights(tf_checkpoint_path, by_name=True)
-
-    return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)
-
-
-def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
-    """ Load TF 2.0 model in a pytorch model
-    """
-    weights = tf_model.weights
-
-    return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys)
-
-
-def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
-    """ Load TF2.0 symbolic weights in a PyTorch model
-    """
-    try:
-        import tensorflow as tf  # noqa: F401
-        import torch  # noqa: F401
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-
-    new_pt_params_dict = {}
-    current_pt_params_dict = dict(pt_model.named_parameters())
-
-    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
-    # TF models always have a prefix, some of PyTorch models (base ones) don't
-    start_prefix_to_remove = ""
-    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
-        start_prefix_to_remove = pt_model.base_model_prefix + "."
-
-    # Build a map from potential PyTorch weight names to TF 2.0 Variables
-    tf_weights_map = {}
-    for tf_weight in tf_weights:
-        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
-            tf_weight.name, start_prefix_to_remove=start_prefix_to_remove
-        )
-        tf_weights_map[pt_name] = (tf_weight.numpy(), transpose)
-
-    all_tf_weights = set(list(tf_weights_map.keys()))
-    loaded_pt_weights_data_ptr = {}
-    missing_keys_pt = []
-    for pt_weight_name, pt_weight in current_pt_params_dict.items():
-        # Handle PyTorch shared weight ()not duplicated in TF 2.0
-        if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
-            new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
-            continue
-
-        # Find associated numpy array in pytorch model state dict
-        if pt_weight_name not in tf_weights_map:
-            if allow_missing_keys:
-                missing_keys_pt.append(pt_weight_name)
-                continue
-            raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))
-
-        array, transpose = tf_weights_map[pt_weight_name]
-
-        if transpose:
-            array = numpy.transpose(array)
-
-        if len(pt_weight.shape) < len(array.shape):
-            array = numpy.squeeze(array)
-        elif len(pt_weight.shape) > len(array.shape):
-            array = numpy.expand_dims(array, axis=0)
-
-        try:
-            assert list(pt_weight.shape) == list(array.shape)
-        except AssertionError as e:
-            e.args += (pt_weight.shape, array.shape)
-            raise e
-
-        # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))
-
-        new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
-        loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
-        all_tf_weights.discard(pt_weight_name)
-
-    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
-    missing_keys += missing_keys_pt
-
-    if len(missing_keys) > 0:
-        logger.info(
-            "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys)
-        )
-    if len(unexpected_keys) > 0:
-        logger.info(
-            "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys)
-        )
-
-    logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights))
-
-    return pt_model
diff --git a/server/transformers/src/transformers/modeling_tf_roberta.py b/server/transformers/src/transformers/modeling_tf_roberta.py
deleted file mode 100644
index 31fb43f1cc6a5479d845f4fa2d242a124a70ccf2..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_roberta.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 RoBERTa model. """
-
-
-import logging
-
-import tensorflow as tf
-
-from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
-    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
-    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
-    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
-}
-
-
-class TFRobertaEmbeddings(TFBertEmbeddings):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
-        self.padding_idx = 1
-
-    def create_position_ids_from_input_ids(self, x):
-        """ Replace non-padding symbols with their position numbers. Position numbers begin at
-        padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
-        `utils.make_positions`.
-        :param torch.Tensor x:
-        :return torch.Tensor:
-        """
-        mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-        incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
-        return incremental_indicies + self.padding_idx
-
-    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
-        """ We are provided embeddings directly. We cannot infer which are padded so just generate
-        sequential position ids.
-        :param torch.Tensor inputs_embeds:
-        :return torch.Tensor:
-        """
-        seq_length = shape_list(inputs_embeds)[1]
-
-        position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
-        return position_ids
-
-    def _embedding(self, inputs, training=False):
-        """Applies embedding based on inputs tensor."""
-        input_ids, position_ids, token_type_ids, inputs_embeds = inputs
-
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(input_ids)
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-
-        return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
-
-
-class TFRobertaMainLayer(TFBertMainLayer):
-    """
-    Same as TFBertMainLayer but uses TFRobertaEmbeddings.
-    """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
-        self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-
-class TFRobertaPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = RobertaConfig
-    pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "roberta"
-
-
-ROBERTA_START_DOCSTRING = r"""
-    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
-    Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-ROBERTA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.RobertaTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`__
-        position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`__
-        head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
-            Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
-            (if set to :obj:`False`) for evaluation.
-"""
-
-
-@add_start_docstrings(
-    "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
-    ROBERTA_START_DOCSTRING,
-)
-class TFRobertaModel(TFRobertaPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.roberta = TFRobertaMainLayer(config, name="roberta")
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Bert pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaModel
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaModel.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        outputs = self.roberta(inputs, **kwargs)
-        return outputs
-
-
-class TFRobertaLMHead(tf.keras.layers.Layer):
-    """Roberta Head for masked language modeling."""
-
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
-        )
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.act = tf.keras.layers.Activation(gelu)
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.decoder = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, features):
-        x = self.dense(features)
-        x = self.act(x)
-        x = self.layer_norm(x)
-
-        # project back to size of vocabulary with bias
-        x = self.decoder(x, mode="linear") + self.bias
-
-        return x
-
-
-@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
-class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.roberta = TFRobertaMainLayer(config, name="roberta")
-        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
-
-    def get_output_embeddings(self):
-        return self.lm_head.decoder
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForMaskedLM
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
-        """
-        outputs = self.roberta(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-        prediction_scores = self.lm_head(sequence_output)
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-
-        return outputs  # prediction_scores, (hidden_states), (attentions)
-
-
-class TFRobertaClassificationHead(tf.keras.layers.Layer):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config, **kwargs):
-        super().__init__(config, **kwargs)
-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size,
-            kernel_initializer=get_initializer(config.initializer_range),
-            activation="tanh",
-            name="dense",
-        )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.out_proj = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
-        )
-
-    def call(self, features, training=False):
-        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
-        x = self.dropout(x, training=training)
-        x = self.dense(x)
-        x = self.dropout(x, training=training)
-        x = self.out_proj(x)
-        return x
-
-
-@add_start_docstrings(
-    """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    ROBERTA_START_DOCSTRING,
-)
-class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.roberta = TFRobertaMainLayer(config, name="roberta")
-        self.classifier = TFRobertaClassificationHead(config, name="classifier")
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.constant([1])[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        outputs = self.roberta(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-        logits = self.classifier(sequence_output, training=kwargs.get("training", False))
-
-        outputs = (logits,) + outputs[2:]
-
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """RoBERTa Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    ROBERTA_START_DOCSTRING,
-)
-class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.roberta = TFRobertaMainLayer(config, name="roberta")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-
-        """
-        outputs = self.roberta(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # scores, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_t5.py b/server/transformers/src/transformers/modeling_tf_t5.py
deleted file mode 100644
index db62e784b10d6e771cd3fe1788535313f9367ea5..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_t5.py
+++ /dev/null
@@ -1,793 +0,0 @@
-# coding=utf-8
-# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 T5 model. """
-
-
-import copy
-import itertools
-import logging
-import math
-
-import tensorflow as tf
-
-from .configuration_t5 import T5Config
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
-    "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
-    "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
-    "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
-}
-
-####################################################
-# TF 2.0 Models are constructed using Keras imperative API by sub-classing
-# - tf.keras.layers.Layer for the layers and
-# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
-####################################################
-
-
-class TFT5LayerNorm(tf.keras.layers.Layer):
-    def __init__(self, epsilon=1e-6, **kwargs):
-        """ Construct a layernorm module in the T5 style
-            No bias and no substraction of mean.
-        """
-        super().__init__(**kwargs)
-        self.variance_epsilon = epsilon
-
-    def build(self, input_shape):
-        """Build shared word embedding layer """
-        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
-        super().build(input_shape)
-
-    def call(self, x):
-        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
-        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
-        return self.weight * x
-
-
-class TFT5DenseReluDense(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
-        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-        self.act = tf.keras.activations.relu
-
-    def call(self, hidden_states, training=False):
-        h = self.wi(hidden_states)
-        h = self.act(h)
-        h = self.dropout(h, training=training)
-        h = self.wo(h)
-        return h
-
-
-class TFT5LayerFF(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-
-    def call(self, hidden_states, training=False):
-        norm_x = self.layer_norm(hidden_states)
-        y = self.DenseReluDense(norm_x, training=training)
-        layer_output = hidden_states + self.dropout(y, training=training)
-        return layer_output
-
-
-class TFT5Attention(tf.keras.layers.Layer):
-    NEW_ID = itertools.count()
-
-    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super().__init__(**kwargs)
-        self.layer_id = next(TFT5Attention.NEW_ID)
-        self.is_decoder = config.is_decoder
-        self.has_relative_attention_bias = has_relative_attention_bias
-
-        self.output_attentions = config.output_attentions
-        self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.d_model = config.d_model
-        self.d_kv = config.d_kv
-        self.n_heads = config.num_heads
-        self.inner_dim = self.n_heads * self.d_kv
-
-        # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q")
-        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k")
-        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v")
-        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-
-        if self.has_relative_attention_bias:
-            self.relative_attention_bias = tf.keras.layers.Embedding(
-                self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias"
-            )
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    @staticmethod
-    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
-        """
-        Adapted from Mesh Tensorflow:
-        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
-
-        Translate relative position to a bucket number for relative attention.
-        The relative position is defined as memory_position - query_position, i.e.
-        the distance in tokens from the attending position to the attended-to
-        position.  If bidirectional=False, then positive relative positions are
-        invalid.
-        We use smaller buckets for small absolute relative_position and larger buckets
-        for larger absolute relative_positions.  All relative positions >=max_distance
-        map to the same bucket.  All relative positions <=-max_distance map to the
-        same bucket.  This should allow for more graceful generalization to longer
-        sequences than the model has been trained on.
-        Args:
-            relative_position: an int32 Tensor
-            bidirectional: a boolean - whether the attention is bidirectional
-            num_buckets: an integer
-            max_distance: an integer
-        Returns:
-            a Tensor with the same shape as relative_position, containing int32
-            values in the range [0, num_buckets)
-        """
-        ret = 0
-        n = -relative_position
-        if bidirectional:
-            num_buckets //= 2
-            ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
-            n = tf.math.abs(n)
-        else:
-            n = tf.math.maximum(n, 0)
-        # now n is in the range [0, inf)
-        max_exact = num_buckets // 2
-        is_small = tf.math.less(n, max_exact)
-        val_if_large = max_exact + tf.dtypes.cast(
-            tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
-            / math.log(max_distance / max_exact)
-            * (num_buckets - max_exact),
-            tf.int32,
-        )
-        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
-        ret += tf.where(is_small, n, val_if_large)
-        return ret
-
-    def compute_bias(self, qlen, klen):
-        """ Compute binned relative position bias """
-        context_position = tf.range(qlen)[:, None]
-        memory_position = tf.range(klen)[None, :]
-        relative_position = memory_position - context_position  # shape (qlen, klen)
-        rp_bucket = self._relative_position_bucket(
-            relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets
-        )
-        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
-        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)  # shape (1, num_heads, qlen, klen)
-        return values
-
-    def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
-        """
-        Self-attention (if kv is None) or attention over source sentence (provided by kv).
-        """
-        # Input is (bs, qlen, dim)
-        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
-        bs, qlen, dim = shape_list(input)
-        if kv is None:
-            klen = qlen if cache is None else cache["slen"] + qlen
-        else:
-            klen = shape_list(kv)[1]
-
-        def shape(x):
-            """  projection """
-            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3))
-
-        def unshape(x):
-            """  compute context """
-            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
-
-        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
-        if kv is None:
-            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
-        elif cache is None or self.layer_id not in cache:
-            k = v = kv
-            k = shape(self.k(k))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v(v))  # (bs, n_heads, qlen, dim_per_head)
-
-        if cache is not None:
-            if self.layer_id in cache:
-                if kv is None:
-                    k_, v_ = cache[self.layer_id]
-                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
-                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
-                else:
-                    k, v = cache[self.layer_id]
-            cache[self.layer_id] = (k, v)
-
-        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
-        # scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
-        scores = tf.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)
-
-        if position_bias is None:
-            if not self.has_relative_attention_bias:
-                raise ValueError("No position_bias provided and no weights to compute position_bias")
-            position_bias = self.compute_bias(qlen, klen)
-            if mask is not None:
-                position_bias = position_bias + mask
-                # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
-                # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
-
-        scores += position_bias
-        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)  # (bs, qlen, dim)
-
-        context = self.o(context)
-
-        outputs = (context,)
-        if self.output_attentions:
-            outputs = outputs + (weights,)
-        if self.has_relative_attention_bias:
-            outputs = outputs + (position_bias,)
-        return outputs
-
-
-class TFT5LayerSelfAttention(tf.keras.layers.Layer):
-    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super().__init__(**kwargs)
-        self.SelfAttention = TFT5Attention(
-            config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
-        )
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-
-    def call(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False):
-        norm_x = self.layer_norm(hidden_states)
-        attention_output = self.SelfAttention(
-            norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training
-        )
-        y = attention_output[0]
-        layer_output = hidden_states + self.dropout(y, training=training)
-        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class TFT5LayerCrossAttention(tf.keras.layers.Layer):
-    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super().__init__(**kwargs)
-        self.EncDecAttention = TFT5Attention(
-            config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
-        )
-        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-
-    def call(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False):
-        norm_x = self.layer_norm(hidden_states)
-        attention_output = self.EncDecAttention(
-            norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training
-        )
-        y = attention_output[0]
-        layer_output = hidden_states + self.dropout(y, training=training)
-        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
-
-
-class TFT5Block(tf.keras.layers.Layer):
-    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super().__init__(**kwargs)
-        self.is_decoder = config.is_decoder
-        self.layer = []
-        self.layer.append(
-            TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0")
-        )
-        if self.is_decoder:
-            self.layer.append(
-                TFT5LayerCrossAttention(
-                    config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1"
-                )
-            )
-            self.layer.append(TFT5LayerFF(config, name="layer_._2"))
-        else:
-            self.layer.append(TFT5LayerFF(config, name="layer_._1"))
-
-    def call(
-        self,
-        hidden_states,
-        attention_mask=None,
-        position_bias=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        encoder_decoder_position_bias=None,
-        head_mask=None,
-        training=False,
-    ):
-        self_attention_outputs = self.layer[0](
-            hidden_states,
-            attention_mask=attention_mask,
-            position_bias=position_bias,
-            head_mask=head_mask,
-            training=training,
-        )
-        hidden_states = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]
-
-        if not self.is_decoder:
-            hidden_states = self.layer[1](hidden_states, training=training)
-        else:
-            cross_attention_outputs = self.layer[1](
-                hidden_states,
-                kv=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask,
-                training=training,
-            )
-            hidden_states = cross_attention_outputs[0]
-            outputs = outputs + cross_attention_outputs[1:]
-            hidden_states = self.layer[2](hidden_states, training=training)
-
-        outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-
-
-####################################################
-# The full model without a specific pretrained or finetuning head is
-# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
-####################################################
-class TFT5MainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.is_decoder = config.is_decoder
-        self.config = config
-        self.num_hidden_layers = config.num_layers
-
-        self.block = [
-            TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i))
-            for i in range(config.num_layers)
-        ]
-        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
-
-    def _prune_heads(self, heads_to_prune):
-        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
-
-    def call(
-        self,
-        hidden_states,
-        attention_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        head_mask=None,
-        training=False,
-    ):
-
-        batch_size, seq_length = shape_list(hidden_states)[:2]
-        if attention_mask is None:
-            attention_mask = tf.fill((batch_size, seq_length), 1)
-        if self.is_decoder and encoder_attention_mask is None:
-            encoder_seq_length = encoder_hidden_states.shape[1]
-            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
-        num_dims_attention_mask = len(shape_list(attention_mask))
-        if num_dims_attention_mask == 3:
-            extended_attention_mask = attention_mask[:, None, :, :]
-        elif num_dims_attention_mask == 2:
-            # Provided a padding mask of dimensions [batch_size, seq_length]
-            # - if the model is a decoder, apply a causal mask in addition to the padding mask
-            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-            if self.config.is_decoder:
-                seq_ids = tf.range(seq_length)
-                causal_mask = tf.less_equal(
-                    tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None]
-                )
-                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
-                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
-            else:
-                extended_attention_mask = attention_mask[:, None, None, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-
-        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
-        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
-        # extended_attention_mask = tf.math.equal(extended_attention_mask,
-        #                                         tf.transpose(extended_attention_mask, perm=(-1, -2)))
-
-        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
-
-        if self.is_decoder:
-            # If a 2D ou 3D attention mask is provided for the cross-attention
-            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
-            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
-            if num_dims_encoder_attention_mask == 3:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-            if num_dims_encoder_attention_mask == 2:
-                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
-
-            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
-            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
-            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
-            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
-
-            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
-        else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        all_hidden_states = ()
-        all_attentions = ()
-        position_bias = None
-        encoder_decoder_position_bias = None
-        for i, layer_module in enumerate(self.block):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(
-                hidden_states,
-                attention_mask=extended_attention_mask,
-                position_bias=position_bias,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_extended_attention_mask,
-                encoder_decoder_position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask[i],
-                training=training,
-            )
-            hidden_states = layer_outputs[0]
-            if i == 0:
-                # We share the position biases between the layers - the first layer store them
-                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-                position_bias = layer_outputs[2 if self.output_attentions else 1]
-                if self.is_decoder:
-                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
-####################################################
-# TFT5PreTrainedModel is a sub-class of tf.keras.Model
-# which take care of loading and saving pretrained weights
-# and various common utilities.
-# Here you just need to specify a few (self-explanatory)
-# pointers for your model.
-####################################################
-class TFT5PreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = T5Config
-    pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-    @property
-    def dummy_inputs(self):
-        input_ids = tf.constant(DUMMY_INPUTS)
-        input_mask = tf.constant(DUMMY_MASK)
-        dummy_inputs = {
-            "decoder_input_ids": input_ids,
-            "encoder_input_ids": input_ids,
-            "decoder_attention_mask": input_mask,
-        }
-        return dummy_inputs
-
-
-T5_START_DOCSTRING = r"""    The T5 model was proposed in
-    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
-    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
-    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
-
-    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
-        https://arxiv.org/abs/1910.10683
-
-    .. _`tf.keras.Model`:
-        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
-
-    Note on the model inputs:
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
-            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-T5_INPUTS_DOCSTRING = r"""
-    Inputs:
-        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
-            (a) For sequence pairs:
-
-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
-            (b) For single sequences:
-
-                ``tokens:         [CLS] the dog is hairy . [SEP]``
-
-
-            T5 is a model with relative position embeddings so you should be able to pad the inputs on
-            the right or the left.
-
-            Indices can be obtained using :class:`transformers.T5Tokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-"""
-
-
-@add_start_docstrings(
-    "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
-    T5_START_DOCSTRING,
-    T5_INPUTS_DOCSTRING,
-)
-class TFT5Model(TFT5PreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import T5Tokenizer, TFT5Model
-
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5Model.from_pretrained('t5-small')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids=input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = TFT5MainLayer(encoder_config, name="encoder")
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = TFT5MainLayer(decoder_config, name="decoder")
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def get_output_embeddings(self):
-        return self.shared
-
-    def call(self, decoder_input_ids, **kwargs):
-        # We allow two types of multi-inputs:
-        # - traditional keyword arguments in the call method
-        # - all the arguments provided as a dict in the first positional argument of call
-        # The last option is useful to use the tf.keras fit() method.
-
-        if isinstance(decoder_input_ids, dict):
-            kwargs.update(decoder_input_ids)
-        else:
-            kwargs["decoder_input_ids"] = decoder_input_ids
-
-        kwargs_common = dict(
-            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
-        )
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
-
-        # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
-        if encoder_hidden_states is None:
-            # Convert encoder inputs in embeddings if needed
-            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
-            if hidden_states is None:
-                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
-
-            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
-        else:
-            encoder_outputs = ()
-
-        # Decode
-        # Convert decoder inputs in embeddings if needed
-        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
-        if hidden_states is None:
-            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-            hidden_states = self.shared(decoder_inputs_ids)
-
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
-        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
-
-        return decoder_outputs + encoder_outputs
-
-
-@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
-class TFT5WithLMHeadModel(TFT5PreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import T5Tokenizer, TFT5WithLMHeadModel
-
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5WithLMHeadModel.from_pretrained('t5-small')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids=input_ids)
-        prediction_scores = outputs[0]
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.model_dim = config.d_model
-
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = TFT5MainLayer(encoder_config, name="encoder")
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = TFT5MainLayer(decoder_config, name="decoder")
-
-    def get_input_embeddings(self):
-        return self.shared
-
-    def get_output_embeddings(self):
-        return self.shared
-
-    def call(self, decoder_input_ids, **kwargs):
-        # We allow two types of multi-inputs:
-        # - traditional keyword arguments in the call method
-        # - all the arguments provided as a dict in the first positional argument of call
-        # The last option is useful to use the tf.keras fit() method.
-
-        if isinstance(decoder_input_ids, dict):
-            kwargs.update(decoder_input_ids)
-        else:
-            kwargs["decoder_input_ids"] = decoder_input_ids
-
-        kwargs_common = dict(
-            (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
-        )
-        kwargs_encoder = kwargs_common.copy()
-        kwargs_decoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
-
-        # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
-        if encoder_hidden_states is None:
-            # Convert encoder inputs in embeddings if needed
-            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
-            if hidden_states is None:
-                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
-
-            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
-        else:
-            encoder_outputs = ()
-
-        # Decode
-        # Convert decoder inputs in embeddings if needed
-        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
-        if hidden_states is None:
-            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-            hidden_states = self.shared(decoder_inputs_ids)
-
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
-        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
-
-        sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
-        lm_logits = self.shared(sequence_output, mode="linear")
-        decoder_outputs = (lm_logits,) + decoder_outputs[1:]
-
-        return decoder_outputs + encoder_outputs
diff --git a/server/transformers/src/transformers/modeling_tf_transfo_xl.py b/server/transformers/src/transformers/modeling_tf_transfo_xl.py
deleted file mode 100644
index 659685388e5d5ad4c5b252e77ede314897b6c83c..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_transfo_xl.py
+++ /dev/null
@@ -1,828 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 Transformer XL model.
-"""
-
-
-import logging
-
-import tensorflow as tf
-
-from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
-}
-
-
-class TFPositionalEmbedding(tf.keras.layers.Layer):
-    def __init__(self, demb, **kwargs):
-        super().__init__(**kwargs)
-
-        self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
-
-    def call(self, pos_seq, bsz=None):
-        sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
-        pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
-
-        if bsz is not None:
-            return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
-        else:
-            return pos_emb[:, None, :]
-
-
-class TFPositionwiseFF(tf.keras.layers.Layer):
-    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
-        super().__init__(**kwargs)
-
-        self.d_model = d_model
-        self.d_inner = d_inner
-        self.dropout = dropout
-
-        self.layer_1 = tf.keras.layers.Dense(
-            d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
-        )
-        self.drop_1 = tf.keras.layers.Dropout(dropout)
-        self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
-        self.drop_2 = tf.keras.layers.Dropout(dropout)
-
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
-
-        self.pre_lnorm = pre_lnorm
-
-    def call(self, inp, training=False):
-        if self.pre_lnorm:
-            # layer normalization + positionwise feed-forward
-            core_out = self.layer_norm(inp)
-            core_out = self.layer_1(core_out)
-            core_out = self.drop_1(core_out, training=training)
-            core_out = self.layer_2(core_out)
-            core_out = self.drop_2(core_out, training=training)
-
-            # residual connection
-            output = core_out + inp
-        else:
-            # positionwise feed-forward
-            core_out = self.layer_1(inp)
-            core_out = self.drop_1(core_out, training=training)
-            core_out = self.layer_2(core_out)
-            core_out = self.drop_2(core_out, training=training)
-
-            # residual connection + layer normalization
-            output = self.layer_norm(inp + core_out)
-
-        return output
-
-
-class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
-    def __init__(
-        self,
-        n_head,
-        d_model,
-        d_head,
-        dropout,
-        dropatt=0,
-        tgt_len=None,
-        ext_len=None,
-        mem_len=None,
-        pre_lnorm=False,
-        r_r_bias=None,
-        r_w_bias=None,
-        output_attentions=False,
-        layer_norm_epsilon=1e-5,
-        init_std=0.02,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.output_attentions = output_attentions
-        self.n_head = n_head
-        self.d_model = d_model
-        self.d_head = d_head
-        self.dropout = dropout
-
-        self.qkv_net = tf.keras.layers.Dense(
-            3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
-        )
-
-        self.drop = tf.keras.layers.Dropout(dropout)
-        self.dropatt = tf.keras.layers.Dropout(dropatt)
-        self.o_net = tf.keras.layers.Dense(
-            d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
-        )
-
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
-
-        self.scale = 1 / (d_head ** 0.5)
-
-        self.pre_lnorm = pre_lnorm
-
-        if r_r_bias is not None and r_w_bias is not None:  # Biases are shared
-            self.r_r_bias = r_r_bias
-            self.r_w_bias = r_w_bias
-        else:
-            self.r_r_bias = None
-            self.r_w_bias = None
-
-        self.r_net = tf.keras.layers.Dense(
-            self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
-        )
-
-    def build(self, input_shape):
-        if self.r_r_bias is None or self.r_w_bias is None:  # Biases are not shared
-            self.r_r_bias = self.add_weight(
-                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
-            )
-            self.r_w_bias = self.add_weight(
-                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
-            )
-        super().build(input_shape)
-
-    def _rel_shift(self, x):
-        x_size = shape_list(x)
-
-        x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
-        x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
-        x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
-        x = tf.reshape(x, x_size)
-
-        return x
-
-    def call(self, inputs, training=False):
-        w, r, attn_mask, mems, head_mask = inputs
-        qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1]
-
-        if mems is not None:
-            cat = tf.concat([mems, w], 0)
-            if self.pre_lnorm:
-                w_heads = self.qkv_net(self.layer_norm(cat))
-            else:
-                w_heads = self.qkv_net(cat)
-            r_head_k = self.r_net(r)
-
-            w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
-            w_head_q = w_head_q[-qlen:]
-        else:
-            if self.pre_lnorm:
-                w_heads = self.qkv_net(self.layer_norm(w))
-            else:
-                w_heads = self.qkv_net(w)
-            r_head_k = self.r_net(r)
-
-            w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
-
-        klen = shape_list(w_head_k)[0]
-
-        w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
-        w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
-        w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
-
-        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))  # qlen x n_head x d_head
-
-        # compute attention score
-        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
-        AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k)  # qlen x klen x bsz x n_head
-
-        rr_head_q = w_head_q + self.r_r_bias
-        BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k)  # qlen x klen x bsz x n_head
-        BD = self._rel_shift(BD)
-
-        # [qlen x klen x bsz x n_head]
-        attn_score = AC + BD
-        attn_score = attn_score * self.scale
-
-        # compute attention probability
-        if attn_mask is not None:
-            attn_mask_t = attn_mask[:, :, None, None]
-            attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
-
-        # [qlen x klen x bsz x n_head]
-        attn_prob = tf.nn.softmax(attn_score, axis=1)
-        attn_prob = self.dropatt(attn_prob, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_prob = attn_prob * head_mask
-
-        # compute attention vector
-        attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)
-
-        # [qlen x bsz x n_head x d_head]
-        attn_vec_sizes = shape_list(attn_vec)
-        attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
-
-        # linear projection
-        attn_out = self.o_net(attn_vec)
-        attn_out = self.drop(attn_out, training=training)
-
-        if self.pre_lnorm:
-            # residual connection
-            outputs = [w + attn_out]
-        else:
-            # residual connection + layer normalization
-            outputs = [self.layer_norm(w + attn_out)]
-
-        if self.output_attentions:
-            outputs.append(attn_prob)
-
-        return outputs
-
-
-class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
-    def __init__(
-        self,
-        n_head,
-        d_model,
-        d_head,
-        d_inner,
-        dropout,
-        tgt_len=None,
-        ext_len=None,
-        mem_len=None,
-        dropatt=0.0,
-        pre_lnorm=False,
-        r_w_bias=None,
-        r_r_bias=None,
-        output_attentions=False,
-        layer_norm_epsilon=1e-5,
-        init_std=0.02,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
-            n_head,
-            d_model,
-            d_head,
-            dropout,
-            tgt_len=tgt_len,
-            ext_len=ext_len,
-            mem_len=mem_len,
-            dropatt=dropatt,
-            pre_lnorm=pre_lnorm,
-            r_w_bias=r_w_bias,
-            r_r_bias=r_r_bias,
-            init_std=init_std,
-            output_attentions=output_attentions,
-            layer_norm_epsilon=layer_norm_epsilon,
-            name="dec_attn",
-        )
-        self.pos_ff = TFPositionwiseFF(
-            d_model,
-            d_inner,
-            dropout,
-            pre_lnorm=pre_lnorm,
-            init_std=init_std,
-            layer_norm_epsilon=layer_norm_epsilon,
-            name="pos_ff",
-        )
-
-    def call(self, inputs, training=False):
-        dec_inp, r, dec_attn_mask, mems, head_mask = inputs
-        attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training)
-        ff_output = self.pos_ff(attn_outputs[0], training=training)
-
-        outputs = [ff_output] + attn_outputs[1:]
-
-        return outputs
-
-
-class TFAdaptiveEmbedding(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
-        super().__init__(**kwargs)
-
-        self.n_token = n_token
-        self.d_embed = d_embed
-        self.init_std = init_std
-
-        self.cutoffs = cutoffs + [n_token]
-        self.div_val = div_val
-        self.d_proj = d_proj
-
-        self.emb_scale = d_proj ** 0.5
-
-        self.cutoff_ends = [0] + self.cutoffs
-
-        self.emb_layers = []
-        self.emb_projs = []
-        if div_val == 1:
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-        else:
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                d_emb_i = d_embed // (div_val ** i)
-                self.emb_layers.append(
-                    tf.keras.layers.Embedding(
-                        r_idx - l_idx,
-                        d_emb_i,
-                        embeddings_initializer=get_initializer(init_std),
-                        name="emb_layers_._{}".format(i),
-                    )
-                )
-
-    def build(self, input_shape):
-        for i in range(len(self.cutoffs)):
-            d_emb_i = self.d_embed // (self.div_val ** i)
-            self.emb_projs.append(
-                self.add_weight(
-                    shape=(d_emb_i, self.d_proj),
-                    initializer=get_initializer(self.init_std),
-                    trainable=True,
-                    name="emb_projs_._{}".format(i),
-                )
-            )
-        super().build(input_shape)
-
-    def call(self, inp):
-        if self.div_val == 1:
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-        else:
-            inp_flat = tf.reshape(inp, (-1,))
-            emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-
-                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
-
-                inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
-                emb_i = self.emb_layers[i](inp_i)
-                emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
-
-                mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
-                emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))
-
-            embed_shape = shape_list(inp) + [self.d_proj]
-            embed = tf.reshape(emb_flat, embed_shape)
-
-        embed *= self.emb_scale
-
-        return embed
-
-
-class TFTransfoXLMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        self.n_token = config.vocab_size
-
-        self.d_embed = config.d_embed
-        self.d_model = config.d_model
-        self.n_head = config.n_head
-        self.d_head = config.d_head
-        self.untie_r = config.untie_r
-
-        self.word_emb = TFAdaptiveEmbedding(
-            config.vocab_size,
-            config.d_embed,
-            config.d_model,
-            config.cutoffs,
-            div_val=config.div_val,
-            init_std=config.init_std,
-            name="word_emb",
-        )
-
-        self.drop = tf.keras.layers.Dropout(config.dropout)
-
-        self.n_layer = config.n_layer
-
-        self.tgt_len = config.tgt_len
-        self.mem_len = config.mem_len
-        self.ext_len = config.ext_len
-        self.max_klen = config.tgt_len + config.ext_len + config.mem_len
-
-        self.attn_type = config.attn_type
-
-        self.layers = []
-        if config.attn_type == 0:  # the default attention
-            for i in range(config.n_layer):
-                self.layers.append(
-                    TFRelPartialLearnableDecoderLayer(
-                        config.n_head,
-                        config.d_model,
-                        config.d_head,
-                        config.d_inner,
-                        config.dropout,
-                        tgt_len=config.tgt_len,
-                        ext_len=config.ext_len,
-                        mem_len=config.mem_len,
-                        dropatt=config.dropatt,
-                        pre_lnorm=config.pre_lnorm,
-                        r_w_bias=None if self.untie_r else self.r_w_bias,
-                        r_r_bias=None if self.untie_r else self.r_r_bias,
-                        output_attentions=self.output_attentions,
-                        layer_norm_epsilon=config.layer_norm_epsilon,
-                        init_std=config.init_std,
-                        name="layers_._{}".format(i),
-                    )
-                )
-        else:  # learnable embeddings and absolute embeddings
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-
-        self.same_length = config.same_length
-        self.clamp_len = config.clamp_len
-
-        if self.attn_type == 0:  # default attention
-            self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
-        else:  # learnable embeddings and absolute embeddings
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-
-    def build(self, input_shape):
-        if not self.untie_r:
-            self.r_w_bias = self.add_weight(
-                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
-            )
-            self.r_r_bias = self.add_weight(
-                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
-            )
-        super().build(input_shape)
-
-    def get_input_embeddings(self):
-        return self.word_emb
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        return self.word_emb
-
-    def backward_compatible(self):
-        self.sample_softmax = -1
-
-    def reset_length(self, tgt_len, ext_len, mem_len):
-        self.tgt_len = tgt_len
-        self.mem_len = mem_len
-        self.ext_len = ext_len
-
-    def _prune_heads(self, heads):
-        raise NotImplementedError
-
-    def init_mems(self, bsz):
-        if self.mem_len > 0:
-            mems = []
-            for i in range(self.n_layer):
-                empty = tf.zeros([self.mem_len, bsz, self.d_model])
-                mems.append(empty)
-
-            return mems
-        else:
-            return None
-
-    def _update_mems(self, hids, mems, qlen, mlen):
-        # does not deal with None
-        if mems is None:
-            return None
-
-        # mems is not None
-        assert len(hids) == len(mems), "len(hids) != len(mems)"
-
-        # There are `mlen + qlen` steps that can be cached into mems
-        # For the next step, the last `ext_len` of the `qlen` tokens
-        # will be used as the extended context. Hence, we only cache
-        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
-        # to `mlen + qlen - self.ext_len`.
-        new_mems = []
-        end_idx = mlen + max(0, qlen - 0 - self.ext_len)
-        beg_idx = max(0, end_idx - self.mem_len)
-        for i in range(len(hids)):
-
-            cat = tf.concat([mems[i], hids[i]], axis=0)
-            tf.stop_gradient(cat)
-            new_mems.append(cat[beg_idx:end_idx])
-
-        return new_mems
-
-    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            mems = inputs[1] if len(inputs) > 1 else mems
-            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
-            assert len(inputs) <= 4, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            mems = inputs.get("mems", mems)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 4, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
-        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_ids = tf.transpose(input_ids, perm=(1, 0))
-            qlen, bsz = shape_list(input_ids)
-        elif inputs_embeds is not None:
-            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
-            qlen, bsz = shape_list(inputs_embeds)[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if mems is None:
-            mems = self.init_mems(bsz)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
-        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.n_layer
-
-        if inputs_embeds is not None:
-            word_emb = inputs_embeds
-        else:
-            word_emb = self.word_emb(input_ids)
-
-        mlen = shape_list(mems[0])[0] if mems is not None else 0
-        klen = mlen + qlen
-
-        attn_mask = tf.ones([qlen, qlen])
-        mask_u = tf.linalg.band_part(attn_mask, 0, -1)
-        mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
-        attn_mask_pad = tf.zeros([qlen, mlen])
-        dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
-        if self.same_length:
-            mask_l = tf.linalg.band_part(attn_mask, -1, 0)
-            dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1)
-        # ::: PyTorch masking code for reference :::
-        # if self.same_length:
-        #     all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
-        #     mask_len = klen - self.mem_len
-        #     if mask_len > 0:
-        #         mask_shift_len = qlen - mask_len
-        #     else:
-        #         mask_shift_len = qlen
-        #     dec_attn_mask = (torch.triu(all_ones, 1+mlen)
-        #             + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
-        # else:
-        #     dec_attn_mask = torch.triu(
-        #         word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
-
-        hids = []
-        attentions = []
-        if self.attn_type == 0:  # default
-            pos_seq = tf.range(klen - 1, -1, -1.0)
-            if self.clamp_len > 0:
-                pos_seq = tf.minimum(pos_seq, self.clamp_len)
-            pos_emb = self.pos_emb(pos_seq)
-
-            core_out = self.drop(word_emb, training=training)
-            pos_emb = self.drop(pos_emb, training=training)
-
-            for i, layer in enumerate(self.layers):
-                hids.append(core_out)
-                mems_i = None if mems is None else mems[i]
-                layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training)
-                core_out = layer_outputs[0]
-                if self.output_attentions:
-                    attentions.append(layer_outputs[1])
-        else:  # learnable embeddings and absolute embeddings
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-
-        core_out = self.drop(core_out, training=training)
-
-        new_mems = self._update_mems(hids, mems, mlen, qlen)
-
-        # We transpose back here to shape [bsz, len, hidden_dim]
-        outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
-        if self.output_hidden_states:
-            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
-            hids.append(core_out)
-            hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
-            outputs.append(hids)
-        if self.output_attentions:
-            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
-            attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
-            outputs.append(attentions)
-        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
-
-
-class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = TransfoXLConfig
-    pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-TRANSFO_XL_START_DOCSTRING = r"""
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-TRANSFO_XL_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
-            given to this model should not be passed as input ids as they have already been computed.
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
-    TRANSFO_XL_START_DOCSTRING,
-)
-class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
-        """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-@add_start_docstrings(
-    """The Transformer-XL Model with a language modeling head on top
-    (adaptive softmax with weights tied to the adaptive input embeddings)""",
-    TRANSFO_XL_START_DOCSTRING,
-)
-class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
-        self.sample_softmax = config.sample_softmax
-        # use sampled softmax
-        if config.sample_softmax > 0:
-            raise NotImplementedError
-        # use adaptive softmax (including standard softmax)
-        else:
-            self.crit = TFAdaptiveSoftmaxMask(
-                config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
-            )
-
-    def reset_length(self, tgt_len, ext_len, mem_len):
-        self.transformer.reset_length(tgt_len, ext_len, mem_len)
-
-    def init_mems(self, bsz):
-        return self.transformer.init_mems(bsz)
-
-    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
-    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
-        """
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            mems = inputs[1] if len(inputs) > 1 else mems
-            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
-            labels = inputs[4] if len(inputs) > 4 else labels
-            assert len(inputs) <= 5, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            mems = inputs.get("mems", mems)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            labels = inputs.get("labels", labels)
-            assert len(inputs) <= 5, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None:
-            bsz, tgt_len = shape_list(input_ids)[:2]
-        else:
-            bsz, tgt_len = shape_list(inputs_embeds)[:2]
-
-        transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training)
-
-        last_hidden = transformer_outputs[0]
-        pred_hid = last_hidden[:, -tgt_len:]
-        outputs = transformer_outputs[1:]
-        if self.sample_softmax > 0 and training:
-            raise NotImplementedError
-        else:
-            # pred_hid = tf.reshape(pred_hid, (-1, shape_list(pred_hid)[-1]))
-            softmax_output = self.crit([pred_hid, labels], training=training)
-            # softmax_output = tf.reshape(softmax_output, (bsz, tgt_len, -1))
-            outputs = [softmax_output] + outputs
-
-        return outputs  # logits, new_mems, (all hidden states), (all attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_transfo_xl_utilities.py b/server/transformers/src/transformers/modeling_tf_transfo_xl_utilities.py
deleted file mode 100644
index 1f6edf3a9b98d142bdb15788b9318d44a4727bed..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_transfo_xl_utilities.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" A TF 2.0 Adaptive Softmax for Transformer XL model.
-"""
-
-
-import tensorflow as tf
-
-from .modeling_tf_utils import shape_list
-
-
-class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
-        super().__init__(**kwargs)
-
-        self.vocab_size = vocab_size
-        self.d_embed = d_embed
-        self.d_proj = d_proj
-
-        self.cutoffs = cutoffs + [vocab_size]
-        self.cutoff_ends = [0] + self.cutoffs
-        self.div_val = div_val
-
-        self.shortlist_size = self.cutoffs[0]
-        self.n_clusters = len(self.cutoffs) - 1
-        self.head_size = self.shortlist_size + self.n_clusters
-        self.keep_order = keep_order
-
-        self.out_layers = []
-        self.out_projs = []
-
-    def build(self, input_shape):
-        if self.n_clusters > 0:
-            self.cluster_weight = self.add_weight(
-                shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
-            )
-            self.cluster_bias = self.add_weight(
-                shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
-            )
-
-        if self.div_val == 1:
-            for i in range(len(self.cutoffs)):
-                if self.d_proj != self.d_embed:
-                    weight = self.add_weight(
-                        shape=(self.d_embed, self.d_proj),
-                        initializer="zeros",
-                        trainable=True,
-                        name="out_projs_._{}".format(i),
-                    )
-                    self.out_projs.append(weight)
-                else:
-                    self.out_projs.append(None)
-                weight = self.add_weight(
-                    shape=(self.vocab_size, self.d_embed,),
-                    initializer="zeros",
-                    trainable=True,
-                    name="out_layers_._{}_._weight".format(i),
-                )
-                bias = self.add_weight(
-                    shape=(self.vocab_size,),
-                    initializer="zeros",
-                    trainable=True,
-                    name="out_layers_._{}_._bias".format(i),
-                )
-                self.out_layers.append((weight, bias))
-        else:
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                d_emb_i = self.d_embed // (self.div_val ** i)
-
-                weight = self.add_weight(
-                    shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i)
-                )
-                self.out_projs.append(weight)
-                weight = self.add_weight(
-                    shape=(r_idx - l_idx, d_emb_i,),
-                    initializer="zeros",
-                    trainable=True,
-                    name="out_layers_._{}_._weight".format(i),
-                )
-                bias = self.add_weight(
-                    shape=(r_idx - l_idx,),
-                    initializer="zeros",
-                    trainable=True,
-                    name="out_layers_._{}_._bias".format(i),
-                )
-                self.out_layers.append((weight, bias))
-        super().build(input_shape)
-
-    @staticmethod
-    def _logit(x, W, b, proj=None):
-        y = x
-        if proj is not None:
-            y = tf.einsum("ibd,ed->ibe", y, proj)
-        return tf.einsum("ibd,nd->ibn", y, W) + b
-
-    @staticmethod
-    def _gather_logprob(logprob, target):
-        lp_size = shape_list(logprob)
-        r = tf.range(lp_size[0])
-        idx = tf.stack([r, target], 1)
-        return tf.gather_nd(logprob, idx)
-
-    def call(self, inputs, return_mean=True, training=False):
-        hidden, target = inputs
-        head_logprob = 0
-        if self.n_clusters == 0:
-            output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
-            if target is not None:
-                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
-            out = tf.nn.log_softmax(output, axis=-1)
-        else:
-            hidden_sizes = shape_list(hidden)
-            out = []
-            loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32)
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                if target is not None:
-                    mask = (target >= l_idx) & (target < r_idx)
-                    mask_idx = tf.where(mask)
-                    cur_target = tf.boolean_mask(target, mask) - l_idx
-
-                if self.div_val == 1:
-                    cur_W = self.out_layers[0][0][l_idx:r_idx]
-                    cur_b = self.out_layers[0][1][l_idx:r_idx]
-                else:
-                    cur_W = self.out_layers[i][0]
-                    cur_b = self.out_layers[i][1]
-
-                if i == 0:
-                    cur_W = tf.concat([cur_W, self.cluster_weight], 0)
-                    cur_b = tf.concat([cur_b, self.cluster_bias], 0)
-
-                    head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
-                    head_logprob = tf.nn.log_softmax(head_logit)
-                    out.append(head_logprob[..., : self.cutoffs[0]])
-                    if target is not None:
-                        cur_head_logprob = tf.boolean_mask(head_logprob, mask)
-                        cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
-                else:
-                    tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
-                    tail_logprob = tf.nn.log_softmax(tail_logit)
-                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
-                    logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
-                    out.append(logprob_i)
-                    if target is not None:
-                        cur_head_logprob = tf.boolean_mask(head_logprob, mask)
-                        cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
-                        cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
-                        cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
-                if target is not None:
-                    loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
-            out = tf.concat(out, axis=-1)
-
-        if target is not None:
-            if return_mean:
-                loss = tf.reduce_mean(loss)
-            # Add the training-time loss value to the layer using `self.add_loss()`.
-            self.add_loss(loss)
-
-            # Log the loss as a metric (we could log arbitrary metrics,
-            # including different metrics for training and inference.
-            self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "")
-
-        return out
diff --git a/server/transformers/src/transformers/modeling_tf_utils.py b/server/transformers/src/transformers/modeling_tf_utils.py
deleted file mode 100644
index 4b64f9364ce4e97d66ad8cb49ebb19a9051ab5d1..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_utils.py
+++ /dev/null
@@ -1,602 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TF general model utils."""
-
-
-import logging
-import os
-
-import h5py
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.keras.saving import hdf5_format
-
-from .configuration_utils import PretrainedConfig
-from .file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
-
-
-logger = logging.getLogger(__name__)
-
-
-class TFModelUtilsMixin:
-    """
-    A few utilities for `tf.keras.Model`s, to be used as a mixin.
-    """
-
-    def num_parameters(self, only_trainable: bool = False) -> int:
-        """
-        Get number of (optionally, trainable) parameters in the model.
-        """
-        if only_trainable:
-            return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables))
-        else:
-            return self.count_params()
-
-
-class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
-    r""" Base class for all TF models.
-
-        :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
-
-        Class attributes (overridden by derived classes):
-            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
-            - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
-            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
-
-                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
-                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
-                - ``path``: a path (string) to the TensorFlow checkpoint.
-
-            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
-    """
-    config_class = None
-    pretrained_model_archive_map = {}
-    base_model_prefix = ""
-
-    @property
-    def dummy_inputs(self):
-        """ Dummy inputs to build the network.
-
-        Returns:
-            tf.Tensor with dummy inputs
-        """
-        return {"input_ids": tf.constant(DUMMY_INPUTS)}
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-        if not isinstance(config, PretrainedConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        # Save config in model
-        self.config = config
-
-    def get_input_embeddings(self):
-        """
-        Returns the model's input embeddings.
-
-        Returns:
-            :obj:`tf.keras.layers.Layer`:
-                A torch module mapping vocabulary to hidden states.
-        """
-        base_model = getattr(self, self.base_model_prefix, self)
-        if base_model is not self:
-            return base_model.get_input_embeddings()
-        else:
-            raise NotImplementedError
-
-    def get_output_embeddings(self):
-        """
-        Returns the model's output embeddings.
-
-        Returns:
-            :obj:`tf.keras.layers.Layer`:
-                A torch module mapping hidden states to vocabulary.
-        """
-        return None  # Overwrite for models with output embeddings
-
-    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
-        """ Build a resized Embedding Variable from a provided token Embedding Module.
-            Increasing the size will add newly initialized vectors at the end
-            Reducing the size will remove vectors from the end
-
-        Args:
-            new_num_tokens: (`optional`) int
-                New number of tokens in the embedding matrix.
-                Increasing the size will add newly initialized vectors at the end
-                Reducing the size will remove vectors from the end
-                If not provided or None: return the provided token Embedding Module.
-        Return: ``tf.Variable``
-            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
-        """
-        # if new_num_tokens is None:
-        #     return old_embeddings
-
-        # old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
-        # if old_num_tokens == new_num_tokens:
-        #     return old_embeddings
-
-        # # Build new embeddings
-        # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
-        # new_embeddings.to(old_embeddings.weight.device)
-
-        # # initialize all new embeddings (in particular added tokens)
-        # self._init_weights(new_embeddings)
-
-        # # Copy word embeddings from the previous weights
-        # num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
-        # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
-
-        # return new_embeddings
-
-    def resize_token_embeddings(self, new_num_tokens=None):
-        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
-        Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
-
-        Arguments:
-
-            new_num_tokens: (`optional`) int:
-                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
-                If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model.
-
-        Return: ``tf.Variable``
-            Pointer to the input tokens Embeddings Module of the model
-        """
-        raise NotImplementedError
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the base model.
-
-            Arguments:
-
-                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
-        """
-        raise NotImplementedError
-
-    def save_pretrained(self, save_directory):
-        """ Save a model and its configuration file to a directory, so that it
-            can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
-        """
-        assert os.path.isdir(
-            save_directory
-        ), "Saving path should be a directory where the model and configuration can be saved"
-
-        # Save configuration file
-        self.config.save_pretrained(save_directory)
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME)
-        self.save_weights(output_model_file)
-        logger.info("Model weights saved in {}".format(output_model_file))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
-
-        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with ``model.train()``
-
-        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
-        It is up to you to train those weights with a downstream fine-tuning task.
-
-        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
-
-        Parameters:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-                - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) one of:
-                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
-                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-
-                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            from_pt: (`optional`) boolean, default False:
-                Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            # For example purposes. Not runnable.
-            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
-            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        from_pt = kwargs.pop("from_pt", False)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-        output_loading_info = kwargs.pop("output_loading_info", False)
-
-        # Load config if we don't provide a configuration
-        if not isinstance(config, PretrainedConfig):
-            config_path = config if config is not None else pretrained_model_name_or_path
-            config, model_kwargs = cls.config_class.from_pretrained(
-                config_path,
-                *model_args,
-                cache_dir=cache_dir,
-                return_unused_kwargs=True,
-                force_download=force_download,
-                resume_download=resume_download,
-                **kwargs,
-            )
-        else:
-            model_kwargs = kwargs
-
-        # Load model
-        if pretrained_model_name_or_path is not None:
-            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-            elif os.path.isdir(pretrained_model_name_or_path):
-                if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
-                    # Load from a TF 2.0 checkpoint
-                    archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
-                elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
-                    # Load from a PyTorch checkpoint
-                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                else:
-                    raise EnvironmentError(
-                        "Error no file named {} found in directory {} or `from_pt` set to False".format(
-                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path
-                        )
-                    )
-            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
-                archive_file = pretrained_model_name_or_path
-            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
-                archive_file = pretrained_model_name_or_path + ".index"
-            else:
-                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
-                if from_pt:
-                    raise EnvironmentError(
-                        "Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name."
-                    )
-
-            # redirect to the cache, if necessary
-            try:
-                resolved_archive_file = cached_path(
-                    archive_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    resume_download=resume_download,
-                    proxies=proxies,
-                )
-            except EnvironmentError as e:
-                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                    logger.error("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file))
-                else:
-                    logger.error(
-                        "Model name '{}' was not found in model name list ({}). "
-                        "We assumed '{}' was a path or url but couldn't find any file "
-                        "associated to this path or url.".format(
-                            pretrained_model_name_or_path,
-                            ", ".join(cls.pretrained_model_archive_map.keys()),
-                            archive_file,
-                        )
-                    )
-                raise e
-            if resolved_archive_file == archive_file:
-                logger.info("loading weights file {}".format(archive_file))
-            else:
-                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
-        else:
-            resolved_archive_file = None
-
-        # Instantiate model.
-        model = cls(config, *model_args, **model_kwargs)
-
-        if from_pt:
-            # Load from a PyTorch checkpoint
-            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
-
-        model(model.dummy_inputs, training=False)  # build the network with dummy inputs
-
-        assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
-        # 'by_name' allow us to do transfer learning by skipping/adding layers
-        # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
-        try:
-            model.load_weights(resolved_archive_file, by_name=True)
-        except OSError:
-            raise OSError(
-                "Unable to load weights from h5 file. "
-                "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. "
-            )
-
-        model(model.dummy_inputs, training=False)  # Make sure restore ops are run
-
-        # Check if the models are the same to output loading informations
-        with h5py.File(resolved_archive_file, "r") as f:
-            if "layer_names" not in f.attrs and "model_weights" in f:
-                f = f["model_weights"]
-            hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names"))
-        model_layer_names = set(layer.name for layer in model.layers)
-        missing_keys = list(model_layer_names - hdf5_layer_names)
-        unexpected_keys = list(hdf5_layer_names - model_layer_names)
-        error_msgs = []
-
-        if len(missing_keys) > 0:
-            logger.info(
-                "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-            )
-        if len(unexpected_keys) > 0:
-            logger.info(
-                "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-            )
-        if len(error_msgs) > 0:
-            raise RuntimeError(
-                "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-            )
-        if output_loading_info:
-            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
-            return model, loading_info
-
-        return model
-
-
-class TFConv1D(tf.keras.layers.Layer):
-    def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
-        """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
-            Basically works like a Linear layer but the weights are transposed
-        """
-        super().__init__(**kwargs)
-        self.nf = nf
-        self.nx = nx
-        self.initializer_range = initializer_range
-
-    def build(self, input_shape):
-        self.weight = self.add_weight(
-            "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range)
-        )
-        self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer())
-
-    def call(self, x):
-        bz, sl = shape_list(x)[:2]
-
-        x = tf.reshape(x, [-1, self.nx])
-        x = tf.matmul(x, self.weight) + self.bias
-
-        x = tf.reshape(x, [bz, sl, self.nf])
-
-        return x
-
-
-class TFSharedEmbeddings(tf.keras.layers.Layer):
-    """Embedding layer whose single weight matrix is used both for the token
-    lookup ("embedding" mode) and, transposed, as an output projection
-    ("linear" mode).
-    """
-
-    def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        # Without an explicit range, fall back to hidden_size ** -0.5.
-        if initializer_range is None:
-            initializer_range = hidden_size ** -0.5
-        self.initializer_range = initializer_range
-
-    def build(self, input_shape):
-        """Create the shared ``[vocab_size, hidden_size]`` weight matrix.
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        weight_shape = [self.vocab_size, self.hidden_size]
-        weight_init = get_initializer(self.initializer_range)
-        self.weight = self.add_weight("weight", shape=weight_shape, initializer=weight_init)
-        super().build(input_shape)
-
-    def call(self, inputs, mode="embedding"):
-        """Apply the layer in one of its two modes.
-        Args:
-            inputs: in "embedding" mode, a tensor of token ids used to index rows of the weight matrix;
-                in "linear" mode, a tensor whose last dimension is ``hidden_size``.
-            mode: string, a valid value is one of "embedding" and "linear".
-        Returns:
-            outputs: (1) If mode == "embedding", the gathered rows of the weight matrix,
-                shape ``inputs.shape + [hidden_size]``; (2) if mode == "linear", the projection
-                onto the vocabulary, shape ``inputs.shape[:-1] + [vocab_size]``.
-        Raises:
-            ValueError: if mode is not valid.
-
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        if mode == "embedding":
-            return self._embedding(inputs)
-        if mode == "linear":
-            return self._linear(inputs)
-        raise ValueError("mode {} is not valid.".format(mode))
-
-    def _embedding(self, input_ids):
-        """Look up the weight rows selected by ``input_ids``."""
-        embedded = tf.gather(self.weight, input_ids)
-        return embedded
-
-    def _linear(self, inputs):
-        """Project ``inputs`` onto the vocabulary with the transposed embedding matrix.
-            Args:
-                inputs: A tensor with shape [..., hidden_size]
-            Returns:
-                tensor with shape [..., vocab_size].
-        """
-        # Remember the leading dimensions so they can be restored after the 2-D matmul.
-        leading_dims = shape_list(inputs)[:-1]
-
-        flat_inputs = tf.reshape(inputs, [-1, self.hidden_size])
-        flat_logits = tf.matmul(flat_inputs, self.weight, transpose_b=True)
-
-        return tf.reshape(flat_logits, leading_dims + [self.vocab_size])
-
-
-class TFSequenceSummary(tf.keras.layers.Layer):
-    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
-        Args of the config class:
-            summary_type:
-                - 'last' => [default] take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj: Add a projection after the vector extraction
-            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_first_dropout: Add a dropout before the projection and activation
-            summary_last_dropout: Add a dropout after the projection and activation
-
-        Every option above is optional on the config: a missing ``summary_type`` means 'last',
-        and any other missing attribute disables the corresponding step.
-    """
-
-    def __init__(self, config, initializer_range=0.02, **kwargs):
-        super().__init__(**kwargs)
-
-        # Look the option up under its own name. The previous check tested for
-        # `summary_use_proj` instead, which ignored `summary_type` on configs lacking
-        # `summary_use_proj` and raised AttributeError in the opposite situation.
-        self.summary_type = getattr(config, "summary_type", "last")
-        if self.summary_type == "attn":
-            # We should use a standard multi-head attention module with absolute positional embedding for that.
-            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
-            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
-            raise NotImplementedError
-
-        # Optional projection, to num_labels when requested and positive, else to hidden_size.
-        self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj
-        if self.has_summary:
-            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
-                num_classes = config.num_labels
-            else:
-                num_classes = config.hidden_size
-            self.summary = tf.keras.layers.Dense(
-                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
-            )
-
-        # Only "tanh" enables an activation; any other value leaves the output untouched.
-        self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh"
-        if self.has_activation:
-            self.activation = tf.keras.activations.tanh
-
-        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
-        if self.has_first_dropout:
-            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
-
-        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
-        if self.has_last_dropout:
-            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
-
-    def call(self, inputs, training=False):
-        """ Summarize a sequence of hidden states into one vector per sequence.
-
-            inputs: either the hidden_states tensor alone, a tuple/list ``(hidden_states[, cls_index])``,
-                or a dict with keys "hidden_states" and optionally "cls_index".
-            hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
-            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
-                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
-                if summary_type == 'cls_index' and cls_index is None:
-                    we take the last token of the sequence as classification token
-            training: forwarded to the dropout layers.
-
-            Returns the summary after the optional first dropout, projection, activation and last dropout.
-        """
-        if not isinstance(inputs, (dict, tuple, list)):
-            hidden_states = inputs
-            cls_index = None
-        elif isinstance(inputs, (tuple, list)):
-            hidden_states = inputs[0]
-            cls_index = inputs[1] if len(inputs) > 1 else None
-            assert len(inputs) <= 2, "Too many inputs."
-        else:
-            hidden_states = inputs.get("hidden_states")
-            cls_index = inputs.get("cls_index", None)
-
-        if self.summary_type == "last":
-            output = hidden_states[:, -1]
-        elif self.summary_type == "first":
-            output = hidden_states[:, 0]
-        elif self.summary_type == "mean":
-            output = tf.reduce_mean(hidden_states, axis=1)
-        elif self.summary_type == "cls_index":
-            hidden_shape = shape_list(hidden_states)  # e.g. [batch, num choices, seq length, hidden dims]
-            if cls_index is None:
-                # Default to the last position: a tensor of shape [batch] or [batch, num choices]
-                # filled with (sequence length - 1).
-                cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1)
-            cls_shape = shape_list(cls_index)
-            if len(cls_shape) <= len(hidden_shape) - 2:
-                # Add a trailing axis so the gather below keeps a length-1 sequence dimension.
-                cls_index = cls_index[..., tf.newaxis]
-            output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
-            # Drop the length-1 sequence axis: shape of output is (batch, num choices, hidden_size).
-            output = tf.squeeze(output, axis=len(hidden_shape) - 2)
-        elif self.summary_type == "attn":
-            raise NotImplementedError
-
-        if self.has_first_dropout:
-            output = self.first_dropout(output, training=training)
-
-        if self.has_summary:
-            output = self.summary(output)
-
-        if self.has_activation:
-            output = self.activation(output)
-
-        if self.has_last_dropout:
-            output = self.last_dropout(output, training=training)
-
-        return output
-
-
-def shape_list(x):
-    """Return the shape of `x` as a list, using static dimensions where known.
-
-    Dimensions that `x.shape` reports as None are replaced by the matching
-    entry of the dynamic `tf.shape(x)` tensor.
-    """
-    static_dims = x.shape.as_list()
-    runtime_shape = tf.shape(x)
-    dims = []
-    for axis, static_dim in enumerate(static_dims):
-        dims.append(runtime_shape[axis] if static_dim is None else static_dim)
-    return dims
-
-
-def get_initializer(initializer_range=0.02):
-    """Build a truncated-normal Keras initializer.
-    Args:
-        initializer_range: float, used as the standard deviation.
-    Returns:
-        `tf.keras.initializers.TruncatedNormal` with stddev = `initializer_range`.
-    """
-    truncated_normal = tf.keras.initializers.TruncatedNormal
-    return truncated_normal(stddev=initializer_range)
diff --git a/server/transformers/src/transformers/modeling_tf_xlm.py b/server/transformers/src/transformers/modeling_tf_xlm.py
deleted file mode 100644
index 44b991d08cb2fafe0965fd3a4832dc5b57b723e8..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_xlm.py
+++ /dev/null
@@ -1,813 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 XLM model.
-"""
-
-
-import itertools
-import logging
-import math
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_xlm import XLMConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-# Maps each pretrained XLM shortcut name to the URL of its `-tf_model.h5` weights file.
-TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-tf_model.h5".format(shortcut)
-    for shortcut in (
-        "xlm-mlm-en-2048",
-        "xlm-mlm-ende-1024",
-        "xlm-mlm-enfr-1024",
-        "xlm-mlm-enro-1024",
-        "xlm-mlm-tlm-xnli15-1024",
-        "xlm-mlm-xnli15-1024",
-        "xlm-clm-enfr-1024",
-        "xlm-clm-ende-1024",
-        "xlm-mlm-17-1280",
-        "xlm-mlm-100-1280",
-    )
-}
-
-
-def create_sinusoidal_embeddings(n_pos, dim, out):
-    """Write a sinusoidal position table into `out` in place; returns nothing.
-
-    The angle for position `pos` and column `j` is
-    ``pos / 10000 ** (2 * (j // 2) / dim)``. Even columns of `out` receive the
-    sine of their angle, odd columns the cosine.
-    """
-    angle_rows = []
-    for pos in range(n_pos):
-        angle_rows.append([pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)])
-    angles = np.array(angle_rows)
-
-    sin_part = tf.constant(np.sin(angles[:, 0::2]))
-    out[:, 0::2] = sin_part
-    cos_part = tf.constant(np.cos(angles[:, 1::2]))
-    out[:, 1::2] = cos_part
-
-
-def gelu(x):
-    """ Gaussian Error Linear Unit, erf-based form: x * 0.5 * (1 + erf(x / sqrt(2))).
-    Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    erf_term = tf.math.erf(x / tf.math.sqrt(2.0))
-    return x * (0.5 * (1.0 + erf_term))
-
-
-def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
-    """
-    Generate hidden states mask, and optionally an attention mask.
-
-    Args:
-        slen: sequence length.
-        lengths: per-sequence lengths; its first dimension is taken as the batch size.
-        causal: if truthy, build a lower-triangular [bs, slen, slen] attention mask;
-            otherwise the attention mask is the padding mask itself.
-        padding_mask: optional mask used as-is in place of one derived from `lengths`.
-        dtype: dtype both returned masks are cast to.
-
-    Returns:
-        (mask, attn_mask), both cast to `dtype`.
-    """
-    bs = shape_list(lengths)[0]
-    # Position indices are needed by the causal mask even when the caller supplies
-    # `padding_mask`, so compute them unconditionally. They used to be created only in
-    # the `else` branch, making `causal=True` with a padding mask fail with NameError.
-    alen = tf.range(slen)
-    if padding_mask is not None:
-        mask = padding_mask
-    else:
-        # assert lengths.max().item() <= slen
-        mask = tf.math.less(alen, lengths[:, tf.newaxis])
-
-    # attention mask is the same as mask, or triangular inferior attention (causal)
-    if causal:
-        # Entry [b, i, j] is True when key position j <= query position i.
-        attn_mask = tf.less_equal(
-            tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis]
-        )
-    else:
-        attn_mask = mask
-
-    # sanity check
-    # assert shape_list(mask) == [bs, slen]
-    tf.debugging.assert_equal(shape_list(mask), [bs, slen])
-    assert causal is False or shape_list(attn_mask) == [bs, slen, slen]
-
-    mask = tf.cast(mask, dtype=dtype)
-    attn_mask = tf.cast(attn_mask, dtype=dtype)
-
-    return mask, attn_mask
-
-
-class TFMultiHeadAttention(tf.keras.layers.Layer):
-    """Multi-head scaled dot-product attention with optional key/value caching.
-
-    Each instance takes its `layer_id` from a class-level counter shared by all
-    instances; that id is the key under which the layer stores its keys/values in
-    the `cache` dict passed to `call`.
-    """
-
-    # Class-wide counter: every new instance consumes the next id.
-    NEW_ID = itertools.count()
-
-    def __init__(self, n_heads, dim, config, **kwargs):
-        super().__init__(**kwargs)
-        self.layer_id = next(TFMultiHeadAttention.NEW_ID)
-        self.output_attentions = config.output_attentions
-        self.dim = dim
-        self.n_heads = n_heads
-        # The model dimension is split evenly across heads.
-        assert self.dim % self.n_heads == 0
-
-        self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
-        self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
-        self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
-        self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
-        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        # Head pruning is not implemented for this layer.
-        raise NotImplementedError
-
-    def call(self, inputs, training=False):
-        """
-        Self-attention (if kv is None) or attention over source sentence (provided by kv).
-
-        `inputs` is a 5-element sequence ``(input, mask, kv, cache, head_mask)``:
-            input: query tensor of shape (bs, qlen, dim).
-            mask: (bs, klen), or a rank-3 mask for causal attention; 1.0 marks positions to keep.
-            kv: optional source tensor for keys/values; None means self-attention.
-            cache: optional dict holding "slen" and, per `layer_id`, a (k, v) tuple. It is updated in place.
-            head_mask: optional multiplier applied to the attention weights.
-
-        Returns a tuple ``(output,)``, or ``(output, weights)`` when `output_attentions` is set.
-        """
-        input, mask, kv, cache, head_mask = inputs
-        # Input is (bs, qlen, dim)
-        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
-        bs, qlen, dim = shape_list(input)
-        if kv is None:
-            # Self-attention: keys span the cached positions plus the new ones.
-            klen = qlen if cache is None else cache["slen"] + qlen
-        else:
-            klen = shape_list(kv)[1]
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        n_heads = self.n_heads
-        dim_per_head = self.dim // n_heads
-        # A rank-3 mask keeps a per-query axis; a rank-2 mask is broadcast over queries.
-        mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
-
-        def shape(x):
-            """Split the last dimension into heads: (bs, len, dim) -> (bs, n_heads, len, dim_per_head)."""
-            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
-
-        def unshape(x):
-            """Merge the heads back: (bs, n_heads, len, dim_per_head) -> (bs, len, dim)."""
-            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
-
-        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-        if kv is None:
-            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-        elif cache is None or self.layer_id not in cache:
-            # Cross-attention with nothing cached yet: project the source sequence.
-            k = v = kv
-            k = shape(self.k_lin(k))  # (bs, n_heads, source len, dim_per_head)
-            v = shape(self.v_lin(v))  # (bs, n_heads, source len, dim_per_head)
-
-        if cache is not None:
-            if self.layer_id in cache:
-                if kv is None:
-                    # Self-attention: prepend the cached keys/values along the length axis.
-                    k_, v_ = cache[self.layer_id]
-                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
-                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
-                else:
-                    # Cross-attention: reuse the cached source projections unchanged.
-                    k, v = cache[self.layer_id]
-            # Store the keys/values for this layer (runs whenever a cache is given).
-            cache[self.layer_id] = (k, v)
-
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
-        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, qlen, klen)
-        mask = tf.reshape(mask, mask_reshape)  # (bs, 1, qlen, klen) or (bs, 1, 1, klen), broadcast over heads
-        # scores.masked_fill_(mask, -float('inf'))                            # (bs, n_heads, qlen, klen)
-        # Additive masking: positions where mask == 0 get a very large negative score.
-        scores = scores - 1e30 * (1.0 - mask)
-
-        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
-        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)  # (bs, qlen, dim)
-
-        outputs = (self.out_lin(context),)
-        if self.output_attentions:
-            outputs = outputs + (weights,)
-        return outputs
-
-
-class TFTransformerFFN(tf.keras.layers.Layer):
-    """Position-wise feed-forward block: lin1 -> activation -> lin2 -> dropout.
-
-    `in_dim` is accepted for signature compatibility but is not used here; the
-    activation is GELU when `config.gelu_activation` is truthy, ReLU otherwise.
-    """
-
-    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
-        super().__init__(**kwargs)
-        dense = tf.keras.layers.Dense
-        self.lin1 = dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
-        self.lin2 = dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
-        if config.gelu_activation:
-            self.act = tf.keras.layers.Activation(gelu)
-        else:
-            self.act = tf.keras.activations.relu
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def call(self, input, training=False):
-        """Apply the feed-forward block to `input`; `training` is forwarded to dropout."""
-        hidden = self.act(self.lin1(input))
-        projected = self.lin2(hidden)
-        return self.dropout(projected, training=training)
-
-
-class TFXLMMainLayer(tf.keras.layers.Layer):
-    """XLM transformer stack as a Keras layer.
-
-    Sums token, position and (optionally) language embeddings, then applies
-    `config.n_layers` blocks of self-attention and feed-forward, each followed by
-    layer normalization. Only encoder configurations are supported: a decoder config,
-    sinusoidal embeddings, a head mask, or head pruning all raise NotImplementedError.
-    """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        # encoder / decoder, output layer
-        self.is_encoder = config.is_encoder
-        self.is_decoder = not config.is_encoder
-        if self.is_decoder:
-            raise NotImplementedError("Currently XLM can only be used as an encoder")
-        # self.with_output = with_output
-        self.causal = config.causal
-
-        # dictionary / languages
-        self.n_langs = config.n_langs
-        self.use_lang_emb = config.use_lang_emb
-        self.n_words = config.n_words
-        self.eos_index = config.eos_index
-        self.pad_index = config.pad_index
-        # self.dico = dico
-        # self.id2lang = config.id2lang
-        # self.lang2id = config.lang2id
-        # assert len(self.dico) == self.n_words
-        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
-
-        # model parameters
-        self.dim = config.emb_dim  # 512 by default
-        self.hidden_dim = self.dim * 4  # 2048 by default
-        self.n_heads = config.n_heads  # 8 by default
-        self.n_layers = config.n_layers
-        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
-
-        # embeddings
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)
-
-        self.position_embeddings = tf.keras.layers.Embedding(
-            config.max_position_embeddings,
-            self.dim,
-            embeddings_initializer=get_initializer(config.embed_init_std),
-            name="position_embeddings",
-        )
-        if config.sinusoidal_embeddings:
-            raise NotImplementedError
-            # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
-        if config.n_langs > 1 and config.use_lang_emb:
-            self.lang_embeddings = tf.keras.layers.Embedding(
-                self.n_langs,
-                self.dim,
-                embeddings_initializer=get_initializer(config.embed_init_std),
-                name="lang_embeddings",
-            )
-        self.embeddings = TFSharedEmbeddings(
-            self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
-        )  # padding_idx=self.pad_index)
-        self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
-
-        # transformer layers
-        self.attentions = []
-        self.layer_norm1 = []
-        self.ffns = []
-        self.layer_norm2 = []
-        # if self.is_decoder:
-        #     self.layer_norm15 = []
-        #     self.encoder_attn = []
-
-        for i in range(self.n_layers):
-            self.attentions.append(
-                TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i))
-            )
-            self.layer_norm1.append(
-                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i))
-            )
-            # if self.is_decoder:
-            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
-            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
-            self.ffns.append(
-                TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i))
-            )
-            self.layer_norm2.append(
-                tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i))
-            )
-
-        if hasattr(config, "pruned_heads"):
-            pruned_heads = config.pruned_heads.copy().items()
-            config.pruned_heads = {}
-            for layer, heads in pruned_heads:
-                if self.attentions[int(layer)].n_heads == config.n_heads:
-                    # NOTE(review): this class only defines `_prune_heads` (which raises
-                    # NotImplementedError); no `prune_heads` is visible here - confirm intent.
-                    self.prune_heads({int(layer): list(map(int, heads))})
-
-    def get_input_embeddings(self):
-        """Return the shared token embedding layer."""
-        return self.embeddings
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        # Resizing the vocabulary is not implemented for this layer.
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        raise NotImplementedError
-
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):  # removed: src_enc=None, src_len=None
-        """Run the transformer stack.
-
-        `inputs` may be the input_ids tensor alone, a tuple/list in the order
-        (input_ids, attention_mask, langs, token_type_ids, position_ids, lengths, cache,
-        head_mask, inputs_embeds), or a dict keyed by those names. Values found in
-        `inputs` take precedence over the matching keyword arguments.
-
-        Returns:
-            A tuple ``(tensor,)`` with the final hidden states, followed by the tuple of
-            all hidden states when `output_hidden_states` is set, then the tuple of
-            attention weights when `output_attentions` is set.
-
-        Raises:
-            ValueError: if both or neither of input_ids / inputs_embeds are given.
-            NotImplementedError: if `head_mask` is given.
-        """
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            langs = inputs[2] if len(inputs) > 2 else langs
-            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
-            position_ids = inputs[4] if len(inputs) > 4 else position_ids
-            lengths = inputs[5] if len(inputs) > 5 else lengths
-            cache = inputs[6] if len(inputs) > 6 else cache
-            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
-            assert len(inputs) <= 9, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            langs = inputs.get("langs", langs)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            lengths = inputs.get("lengths", lengths)
-            cache = inputs.get("cache", cache)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 9, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            bs, slen = shape_list(input_ids)
-        elif inputs_embeds is not None:
-            bs, slen = shape_list(inputs_embeds)[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if lengths is None:
-            if input_ids is not None:
-                # Count the non-padding tokens of each sequence.
-                lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
-            else:
-                # No ids to inspect: treat every sequence as full length.
-                lengths = tf.convert_to_tensor([slen] * bs, tf.int32)
-        # mask = input_ids != self.pad_index
-
-        # check inputs
-        # assert shape_list(lengths)[0] == bs
-        tf.debugging.assert_equal(shape_list(lengths)[0], bs)
-        # assert lengths.max().item() <= slen
-        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
-        # assert (src_enc is None) == (src_len is None)
-        # if src_enc is not None:
-        #     assert self.is_decoder
-        #     assert src_enc.size(0) == bs
-
-        # generate masks
-        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
-        # if self.is_decoder and src_enc is not None:
-        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
-
-        # position_ids
-        if position_ids is None:
-            position_ids = tf.expand_dims(tf.range(slen), axis=0)
-        else:
-            # assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
-            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])
-            # position_ids = position_ids.transpose(0, 1)
-
-        # langs
-        if langs is not None:
-            # assert shape_list(langs) == [bs, slen]  # (slen, bs)
-            tf.debugging.assert_equal(shape_list(langs), [bs, slen])
-            # langs = langs.transpose(0, 1)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.n_layers
-
-        # do not recompute cached elements
-        if cache is not None and input_ids is not None:
-            # Keep only the trailing positions that are not covered by the cache yet.
-            _slen = slen - cache["slen"]
-            input_ids = input_ids[:, -_slen:]
-            position_ids = position_ids[:, -_slen:]
-            if langs is not None:
-                langs = langs[:, -_slen:]
-            mask = mask[:, -_slen:]
-            attn_mask = attn_mask[:, -_slen:]
-
-        # embeddings
-        if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)
-
-        tensor = inputs_embeds + self.position_embeddings(position_ids)
-        if langs is not None and self.use_lang_emb:
-            tensor = tensor + self.lang_embeddings(langs)
-        if token_type_ids is not None:
-            # Token type ids are looked up in the same shared token embedding matrix.
-            tensor = tensor + self.embeddings(token_type_ids)
-        tensor = self.layer_norm_emb(tensor)
-        tensor = self.dropout(tensor, training=training)
-        # Zero out the padded positions.
-        tensor = tensor * mask[..., tf.newaxis]
-
-        # transformer layers
-        hidden_states = ()
-        attentions = ()
-        for i in range(self.n_layers):
-            if self.output_hidden_states:
-                hidden_states = hidden_states + (tensor,)
-
-            # self attention
-            attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
-            attn = attn_outputs[0]
-            if self.output_attentions:
-                attentions = attentions + (attn_outputs[1],)
-            attn = self.dropout(attn, training=training)
-            tensor = tensor + attn
-            tensor = self.layer_norm1[i](tensor)
-
-            # encoder attention (for decoder only)
-            # if self.is_decoder and src_enc is not None:
-            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
-            #     tensor = tensor + attn
-            #     tensor = self.layer_norm15[i](tensor)
-
-            # FFN
-            tensor = tensor + self.ffns[i](tensor)
-            tensor = self.layer_norm2[i](tensor)
-            tensor = tensor * mask[..., tf.newaxis]
-
-        # Add last hidden state
-        if self.output_hidden_states:
-            hidden_states = hidden_states + (tensor,)
-
-        # update cache length
-        if cache is not None:
-            # TF tensors have no PyTorch-style `.size(dim)`; read the sequence
-            # dimension through shape_list instead.
-            cache["slen"] += shape_list(tensor)[1]
-
-        # move back sequence length to dimension 0
-        # tensor = tensor.transpose(0, 1)
-
-        outputs = (tensor,)
-        if self.output_hidden_states:
-            outputs = outputs + (hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (attentions,)
-        return outputs  # outputs, (hidden_states), (attentions)
-
-
-class TFXLMPreTrainedModel(TFPreTrainedModel):
-    """ Abstract base for the TF XLM models: binds the XLM config class, the
-        pretrained weights archive map and the base model prefix used by the
-        loading machinery of the parent class.
-    """
-
-    config_class = XLMConfig
-    pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-    @property
-    def dummy_inputs(self):
-        """Small fixed batch (3 sequences of 5 tokens) as a dict of inputs.
-
-        "langs" is a tensor only when the config enables language embeddings
-        with more than one language, so those get built too; otherwise it is None.
-        """
-        dummy = {
-            "input_ids": tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]),
-            "attention_mask": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]),
-            "langs": None,
-        }
-        if self.config.use_lang_emb and self.config.n_langs > 1:
-            dummy["langs"] = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-        return dummy
-
-
-# Shared introductory documentation text for the TF XLM models: explains the accepted
-# input formats and the `config` parameter. Presumably attached to the model classes
-# through `add_start_docstrings` (imported above) - the usage is outside this chunk.
-XLM_START_DOCSTRING = r"""
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XLM_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            A parallel sequence of tokens to be used to indicate the language of each token in the input.
-            Indices are languages ids which can be obtained from the language names by using two conversion mappings
-            provided in the configuration of the model (only provided for multilingual models).
-            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
-            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
-
-            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
-        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Length of each sentence that can be used to avoid performing attention on padding token indices.
-            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
-            Indices selected in ``[0, ..., input_ids.size(-1)]``:
-        cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`):
-            dictionary with ``tf.Tensor`` that contains pre-computed
-            hidden-states (key and values in the attention blocks) as computed by the model
-            (see `cache` output below). Can be used to speed up sequential decoding.
-            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
-    XLM_START_DOCSTRING,
-)
-class TFXLMModel(TFXLMPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-class TFXLMPredLayer(tf.keras.layers.Layer):
-    """
-    Prediction layer (cross_entropy or adaptive_softmax).
-    """
-
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.asm = config.asm
-        self.n_words = config.n_words
-        self.pad_index = config.pad_index
-        if config.asm is False:
-            self.input_embeddings = input_embeddings
-        else:
-            raise NotImplementedError
-            # self.proj = nn.AdaptiveLogSoftmaxWithLoss(
-            #     in_features=dim,
-            #     n_classes=config.n_words,
-            #     cutoffs=config.asm_cutoffs,
-            #     div_value=config.asm_div_value,
-            #     head_bias=True,  # default is False
-            # )
-
-    def build(self, input_shape):
-        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
-        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-@add_start_docstrings(
-    """The XLM Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    XLM_START_DOCSTRING,
-)
-class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name="transformer")
-        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
-
-    def get_output_embeddings(self):
-        return self.pred_layer.input_embeddings
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMWithLMHeadModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-
-        output = transformer_outputs[0]
-        outputs = self.pred_layer(output)
-        outputs = (outputs,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        return outputs
-
-
-@add_start_docstrings(
-    """XLM Model with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XLM_START_DOCSTRING,
-)
-class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXLMMainLayer(config, name="transformer")
-        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForSequenceClassification
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.constant([1])[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        output = transformer_outputs[0]
-
-        logits = self.sequence_summary(output)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-        return outputs
-
-
-@add_start_docstrings(
-    """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING,
-)
-class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLMMainLayer(config, name="transformer")
-        self.qa_outputs = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
-        )
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = transformer_outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + transformer_outputs[
-            1:
-        ]  # Keep mems, hidden states, attentions if there are in it
-
-        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/src/transformers/modeling_tf_xlm_roberta.py b/server/transformers/src/transformers/modeling_tf_xlm_roberta.py
deleted file mode 100644
index 8b1efdb65df064a788105d045966020227dbb5ae..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_xlm_roberta.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# coding=utf-8
-# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0  XLM-RoBERTa model. """
-
-
-import logging
-
-from .configuration_xlm_roberta import XLMRobertaConfig
-from .file_utils import add_start_docstrings
-from .modeling_tf_roberta import (
-    TFRobertaForMaskedLM,
-    TFRobertaForSequenceClassification,
-    TFRobertaForTokenClassification,
-    TFRobertaModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {}
-
-
-XLM_ROBERTA_START_DOCSTRING = r"""
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class TFXLMRobertaModel(TFRobertaModel):
-    """
-    This class overrides :class:`~transformers.TFRobertaModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
-)
-class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM):
-    """
-    This class overrides :class:`~transformers.TFRobertaForMaskedLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class TFXLMRobertaForSequenceClassification(TFRobertaForSequenceClassification):
-    """
-    This class overrides :class:`~transformers.TFRobertaForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification):
-    """
-    This class overrides :class:`~transformers.TFRobertaForTokenClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/server/transformers/src/transformers/modeling_tf_xlnet.py b/server/transformers/src/transformers/modeling_tf_xlnet.py
deleted file mode 100644
index d9ced75384c18de6508fbff5b7d3f6e13779404f..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_tf_xlnet.py
+++ /dev/null
@@ -1,1197 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 XLNet model.
-"""
-
-
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from .configuration_xlnet import XLNetConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5",
-    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5",
-}
-
-
-def gelu(x):
-    """ Implementation of the gelu activation function.
-        XLNet is using OpenAI GPT's gelu
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
-
-
-def swish(x):
-    return x * tf.sigmoid(x)
-
-
-ACT2FN = {
-    "gelu": tf.keras.layers.Activation(gelu),
-    "relu": tf.keras.activations.relu,
-    "swish": tf.keras.layers.Activation(swish),
-}
-
-
-class TFXLNetRelativeAttention(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-
-        if config.d_model % config.n_head != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head)
-            )
-
-        self.n_head = config.n_head
-        self.d_head = config.d_head
-        self.d_model = config.d_model
-        self.scale = 1 / (config.d_head ** 0.5)
-        self.initializer_range = config.initializer_range
-
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def build(self, input_shape):
-        initializer = get_initializer(self.initializer_range)
-        self.q = self.add_weight(
-            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
-        )
-        self.k = self.add_weight(
-            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k"
-        )
-        self.v = self.add_weight(
-            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v"
-        )
-        self.o = self.add_weight(
-            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o"
-        )
-        self.r = self.add_weight(
-            shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r"
-        )
-        self.r_r_bias = self.add_weight(
-            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
-        )
-        self.r_s_bias = self.add_weight(
-            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias"
-        )
-        self.r_w_bias = self.add_weight(
-            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
-        )
-        self.seg_embed = self.add_weight(
-            shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
-        )
-        super().build(input_shape)
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    def rel_shift(self, x, klen=-1):
-        """perform relative shift to form the relative attention score."""
-        x_size = shape_list(x)
-
-        x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
-        x = x[1:, ...]
-        x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
-        x = x[:, 0:klen, :, :]
-        # x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
-
-        return x
-
-    def rel_attn_core(self, inputs, training=False):
-        """Core relative positional attention operations."""
-
-        q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs
-
-        # content based attention score
-        ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h)
-
-        # position based attention score
-        bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift(bd, klen=shape_list(ac)[1])
-
-        # segment based attention score
-        if seg_mat is None:
-            ef = 0
-        else:
-            ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
-            ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef)
-
-        # merge attention scores and perform masking
-        attn_score = (ac + bd + ef) * self.scale
-        if attn_mask is not None:
-            # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
-            if attn_mask.dtype == tf.float16:
-                attn_score = attn_score - 65500 * attn_mask
-            else:
-                attn_score = attn_score - 1e30 * attn_mask
-
-        # attention probability
-        attn_prob = tf.nn.softmax(attn_score, axis=1)
-
-        attn_prob = self.dropout(attn_prob, training=training)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_prob = attn_prob * head_mask
-
-        # attention output
-        attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)
-
-        if self.output_attentions:
-            return attn_vec, attn_prob
-
-        return attn_vec
-
-    def post_attention(self, inputs, residual=True, training=False):
-        """Post-attention processing."""
-        # post-attention projection (back to `d_model`)
-        h, attn_vec = inputs
-
-        attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o)
-
-        attn_out = self.dropout(attn_out, training=training)
-
-        if residual:
-            attn_out = attn_out + h
-        output = self.layer_norm(attn_out)
-
-        return output
-
-    def call(self, inputs, training=False):
-        (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs
-
-        if g is not None:
-            # Two-stream attention with relative positional encoding.
-            # content based attention score
-            if mems is not None and len(shape_list(mems)) > 1:
-                cat = tf.concat([mems, h], axis=0)
-            else:
-                cat = h
-
-            # content-based key head
-            k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k)
-
-            # content-based value head
-            v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v)
-
-            # position-based key head
-            k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r)
-
-            # h-stream
-            # content-stream query head
-            q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q)
-
-            # core attention ops
-            attn_vec_h = self.rel_attn_core(
-                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training
-            )
-
-            if self.output_attentions:
-                attn_vec_h, attn_prob_h = attn_vec_h
-
-            # post processing
-            output_h = self.post_attention([h, attn_vec_h], training=training)
-
-            # g-stream
-            # query-stream query head
-            q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q)
-
-            # core attention ops
-            if target_mapping is not None:
-                q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
-                attn_vec_g = self.rel_attn_core(
-                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training
-                )
-
-                if self.output_attentions:
-                    attn_vec_g, attn_prob_g = attn_vec_g
-
-                attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
-            else:
-                attn_vec_g = self.rel_attn_core(
-                    [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training
-                )
-
-                if self.output_attentions:
-                    attn_vec_g, attn_prob_g = attn_vec_g
-
-            # post processing
-            output_g = self.post_attention([g, attn_vec_g], training=training)
-
-            if self.output_attentions:
-                attn_prob = attn_prob_h, attn_prob_g
-
-        else:
-            # Multi-head attention with relative positional encoding
-            if mems is not None and len(shape_list(mems)) > 1:
-                cat = tf.concat([mems, h], axis=0)
-            else:
-                cat = h
-
-            # content heads
-            q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q)
-            k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k)
-            v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v)
-
-            # positional heads
-            k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r)
-
-            # core attention ops
-            attn_vec = self.rel_attn_core(
-                [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training
-            )
-
-            if self.output_attentions:
-                attn_vec, attn_prob = attn_vec
-
-            # post processing
-            output_h = self.post_attention([h, attn_vec], training=training)
-            output_g = None
-
-        outputs = (output_h, output_g)
-        if self.output_attentions:
-            outputs = outputs + (attn_prob,)
-        return outputs
-
-
-class TFXLNetFeedForward(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
-        self.layer_1 = tf.keras.layers.Dense(
-            config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
-        )
-        self.layer_2 = tf.keras.layers.Dense(
-            config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
-        )
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-        if isinstance(config.ff_activation, str):
-            self.activation_function = ACT2FN[config.ff_activation]
-        else:
-            self.activation_function = config.ff_activation
-
-    def call(self, inp, training=False):
-        output = inp
-        output = self.layer_1(output)
-        output = self.activation_function(output)
-        output = self.dropout(output, training=training)
-        output = self.layer_2(output)
-        output = self.dropout(output, training=training)
-        output = self.layer_norm(output + inp)
-        return output
-
-
-class TFXLNetLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
-        self.ff = TFXLNetFeedForward(config, name="ff")
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def call(self, inputs, training=False):
-        outputs = self.rel_attn(inputs, training=training)
-        output_h, output_g = outputs[:2]
-
-        if output_g is not None:
-            output_g = self.ff(output_g, training=training)
-        output_h = self.ff(output_h, training=training)
-
-        outputs = (output_h, output_g) + outputs[2:]  # Add again attentions if there are there
-        return outputs
-
-
-class TFXLNetLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, input_embeddings, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = config.vocab_size
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.input_embeddings = input_embeddings
-
-    def build(self, input_shape):
-        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super().build(input_shape)
-
-    def call(self, hidden_states):
-        hidden_states = self.input_embeddings(hidden_states, mode="linear")
-        hidden_states = hidden_states + self.bias
-        return hidden_states
-
-
-class TFXLNetMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.output_past = config.output_past
-
-        self.mem_len = config.mem_len
-        self.reuse_len = config.reuse_len
-        self.d_model = config.d_model
-        self.same_length = config.same_length
-        self.attn_type = config.attn_type
-        self.bi_data = config.bi_data
-        self.clamp_len = config.clamp_len
-        self.n_layer = config.n_layer
-        self.use_bfloat16 = config.use_bfloat16
-        self.initializer_range = config.initializer_range
-
-        self.word_embedding = TFSharedEmbeddings(
-            config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding"
-        )
-        self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)]
-        self.dropout = tf.keras.layers.Dropout(config.dropout)
-
-    def get_input_embeddings(self):
-        return self.word_embedding
-
-    def build(self, input_shape):
-        initializer = get_initializer(self.initializer_range)
-        self.mask_emb = self.add_weight(
-            shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
-        )
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        raise NotImplementedError
-
-    def create_mask(self, qlen, mlen, dtype=tf.float32):
-        """
-        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
-
-        Args:
-            qlen: TODO Lysandre didn't fill
-            mlen: TODO Lysandre didn't fill
-
-        ::
-
-                  same_length=False:      same_length=True:
-                  <mlen > <  qlen >       <mlen > <  qlen >
-               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
-                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
-            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
-                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
-               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
-
-        """
-        attn_mask = tf.ones([qlen, qlen], dtype=dtype)
-        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
-        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
-        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
-        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
-        if self.same_length:
-            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
-            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
-        return ret
-
-    def cache_mem(self, curr_out, prev_mem):
-        """cache hidden states into memory."""
-        if self.reuse_len is not None and self.reuse_len > 0:
-            curr_out = curr_out[: self.reuse_len]
-
-        if prev_mem is None:
-            new_mem = curr_out[-self.mem_len :]
-        else:
-            new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :]
-
-        return tf.stop_gradient(new_mem)
-
-    @staticmethod
-    def positional_embedding(pos_seq, inv_freq, bsz=None):
-        sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq)
-        pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
-        pos_emb = pos_emb[:, None, :]
-
-        if bsz is not None:
-            pos_emb = tf.tile(pos_emb, [1, bsz, 1])
-
-        return pos_emb
-
-    def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None):
-        """create relative positional encoding."""
-        freq_seq = tf.range(0, self.d_model, 2.0)
-        if dtype is not None and dtype != tf.float32:
-            freq_seq = tf.cast(freq_seq, dtype=dtype)
-        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
-
-        if self.attn_type == "bi":
-            # beg, end = klen - 1, -qlen
-            beg, end = klen, -qlen
-        elif self.attn_type == "uni":
-            # beg, end = klen - 1, -1
-            beg, end = klen, -1
-        else:
-            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
-
-        if self.bi_data:
-            fwd_pos_seq = tf.range(beg, end, -1.0)
-            bwd_pos_seq = tf.range(-beg, -end, 1.0)
-
-            if dtype is not None and dtype != tf.float32:
-                fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
-                bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype)
-
-            if self.clamp_len > 0:
-                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
-                bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
-
-            if bsz is not None:
-                # With bi_data, the batch size should be divisible by 2.
-                assert bsz % 2 == 0
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
-            else:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
-
-            pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
-        else:
-            fwd_pos_seq = tf.range(beg, end, -1.0)
-            if dtype is not None and dtype != tf.float32:
-                fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
-            if self.clamp_len > 0:
-                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
-            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
-
-        return pos_emb
-
-    def call(
-        self,
-        inputs,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        training=False,
-    ):
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            mems = inputs[2] if len(inputs) > 2 else mems
-            perm_mask = inputs[3] if len(inputs) > 3 else perm_mask
-            target_mapping = inputs[4] if len(inputs) > 4 else target_mapping
-            token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
-            input_mask = inputs[6] if len(inputs) > 6 else input_mask
-            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
-            assert len(inputs) <= 9, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            mems = inputs.get("mems", mems)
-            perm_mask = inputs.get("perm_mask", perm_mask)
-            target_mapping = inputs.get("target_mapping", target_mapping)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            input_mask = inputs.get("input_mask", input_mask)
-            head_mask = inputs.get("head_mask", head_mask)
-            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
-            assert len(inputs) <= 9, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
-        # but we want a unified interface in the library with the batch size on the first dimension
-        # so we move here the first dimension (batch) to the end
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_ids = tf.transpose(input_ids, perm=(1, 0))
-            qlen, bsz = shape_list(input_ids)[:2]
-        elif inputs_embeds is not None:
-            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
-            qlen, bsz = shape_list(inputs_embeds)[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
-        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
-        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
-        perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
-        target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
-
-        mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
-        klen = mlen + qlen
-
-        dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32
-
-        # Attention mask
-        # causal attention mask
-        if self.attn_type == "uni":
-            attn_mask = self.create_mask(qlen, mlen)
-            attn_mask = attn_mask[:, :, None, None]
-        elif self.attn_type == "bi":
-            attn_mask = None
-        else:
-            raise ValueError("Unsupported attention type: {}".format(self.attn_type))
-
-        # data mask: input mask & perm mask
-        assert input_mask is None or attention_mask is None, (
-            "You can only use one of input_mask (uses 1 for padding) "
-            "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
-        )
-        if input_mask is None and attention_mask is not None:
-            input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
-        if input_mask is not None and perm_mask is not None:
-            data_mask = input_mask[None] + perm_mask
-        elif input_mask is not None and perm_mask is None:
-            data_mask = input_mask[None]
-        elif input_mask is None and perm_mask is not None:
-            data_mask = perm_mask
-        else:
-            data_mask = None
-
-        if data_mask is not None:
-            # all mems can be attended to
-            mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float)
-            data_mask = tf.concat([mems_mask, data_mask], axis=1)
-            if attn_mask is None:
-                attn_mask = data_mask[:, :, :, None]
-            else:
-                attn_mask += data_mask[:, :, :, None]
-
-        if attn_mask is not None:
-            attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float)
-
-        if attn_mask is not None:
-            non_tgt_mask = -tf.eye(qlen, dtype=dtype_float)
-            non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1)
-            non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float)
-        else:
-            non_tgt_mask = None
-
-        # Word embeddings and prepare h & g hidden states
-        if inputs_embeds is not None:
-            word_emb_k = inputs_embeds
-        else:
-            word_emb_k = self.word_embedding(input_ids)
-        output_h = self.dropout(word_emb_k, training=training)
-        if target_mapping is not None:
-            word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
-            # else:  # We removed the inp_q input which was same as target mapping
-            #     inp_q_ext = inp_q[:, :, None]
-            #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
-            output_g = self.dropout(word_emb_q, training=training)
-        else:
-            output_g = None
-
-        # Segment embedding
-        if token_type_ids is not None:
-            # Convert `token_type_ids` to one-hot `seg_mat`
-            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
-            cat_ids = tf.concat([mem_pad, token_type_ids], 0)
-
-            # `1` indicates not in the same segment [qlen x klen x bsz]
-            seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32)
-            seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float)
-        else:
-            seg_mat = None
-
-        # Positional encoding
-        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
-        pos_emb = self.dropout(pos_emb, training=training)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
-        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
-                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.n_layer
-
-        new_mems = ()
-        if mems is None:
-            mems = [None] * len(self.layer)
-
-        attentions = []
-        hidden_states = []
-        for i, layer_module in enumerate(self.layer):
-            # cache new mems
-            if self.mem_len is not None and self.mem_len > 0 and self.output_past:
-                new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
-            if self.output_hidden_states:
-                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
-
-            outputs = layer_module(
-                [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]],
-                training=training,
-            )
-            output_h, output_g = outputs[:2]
-            if self.output_attentions:
-                attentions.append(outputs[2])
-
-        # Add last hidden state
-        if self.output_hidden_states:
-            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
-
-        output = self.dropout(output_g if output_g is not None else output_h, training=training)
-
-        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
-        outputs = (tf.transpose(output, perm=(1, 0, 2)),)
-
-        if self.mem_len is not None and self.mem_len > 0 and self.output_past:
-            outputs = outputs + (new_mems,)
-
-        if self.output_hidden_states:
-            if output_g is not None:
-                hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
-            else:
-                hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
-            outputs = outputs + (hidden_states,)
-        if self.output_attentions:
-            attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
-            outputs = outputs + (attentions,)
-
-        return outputs  # outputs, (new_mems), (hidden_states), (attentions)
-
-
-class TFXLNetPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = XLNetConfig
-    pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-XLNET_START_DOCSTRING = r"""
-
-    .. note::
-
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
-        all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors
-        in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-          :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-          :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XLNET_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
-            given to this model should not be passed as input ids as they have already been computed.
-        perm_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
-            If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
-            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
-            If None, each token attends to all the others (full bidirectional attention).
-            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
-        target_mapping (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to indicate the output tokens to use.
-            If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
-            Only used during pretraining for partial prediction or for sequential decoding (generation).
-        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        input_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
-            Kept for compatibility with the original code base.
-            You can only uses one of `input_mask` and `attention_mask`
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
-        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
-    XLNET_START_DOCSTRING,
-)
-class TFXLNetModel(TFXLNetPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name="transformer")
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetModel
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-@add_start_docstrings(
-    """XLNet Model with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    XLNET_START_DOCSTRING,
-)
-class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
-
-    def get_output_embeddings(self):
-        return self.lm_loss.input_embeddings
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        import numpy as np
-        from transformers import XLNetTokenizer, TFXLNetLMHeadModel
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
-
-        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
-        perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
-        outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
-
-        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        hidden_state = transformer_outputs[0]
-        logits = self.lm_loss(hidden_state)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        return outputs  # return logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XLNET_START_DOCSTRING,
-)
-class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.sequence_summary = TFSequenceSummary(
-            config, initializer_range=config.initializer_range, name="sequence_summary"
-        )
-        self.logits_proj = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
-        )
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        output = transformer_outputs[0]
-
-        output = self.sequence_summary(output)
-        logits = self.logits_proj(output)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        return outputs  # return logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLNET_START_DOCSTRING,
-)
-class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    def call(self, inputs, **kwargs):
-        r"""
-    Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
-            Classification scores (before SoftMax).
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForTokenClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-        output = transformer_outputs[0]
-
-        logits = self.classifier(output)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        return outputs  # return logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING,
-)
-class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXLNetMainLayer(config, name="transformer")
-        self.qa_outputs = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
-        )
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def call(self, inputs, **kwargs):
-        r"""
-    Returns:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = transformer_outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + transformer_outputs[
-            1:
-        ]  # Keep mems, hidden states, attentions if there are in it
-
-        return outputs  # start_logits, end_logits, (mems), (hidden_states), (attentions)
-
-
-# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-#     the hidden-states output to compute `span start logits` and `span end logits`). """,
-#     XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
-# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel):
-#     r"""
-#     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-#         **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-#             ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)``
-#             Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-#         **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-#             ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)``
-#             Indices for the top config.start_n_top start token possibilities (beam-search).
-#         **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-#             ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
-#             Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-#         **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-#             ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
-#             Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-#         **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-#             ``tf.Tensor`` of shape ``(batch_size,)``
-#             Log probabilities for the ``is_impossible`` label of the answers.
-#         **mems**:
-#             list of ``tf.Tensor`` (one for each layer):
-#             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-#             if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
-#             See details in the docstring of the `mems` input above.
-#         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-#             list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-#             of shape ``(batch_size, sequence_length, hidden_size)``:
-#             Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-#         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-#             list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-#             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-#     Examples::
-
-#         # For example purposes. Not runnable.
-#         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-#         model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-#         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-#         start_positions = tf.constant([1])
-#         end_positions = tf.constant([3])
-#         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-#         loss, start_scores, end_scores = outputs[:2]
-
-#     """
-#     def __init__(self, config, *inputs, **kwargs):
-#         super().__init__(config, *inputs, **kwargs)
-#         self.start_n_top = config.start_n_top
-#         self.end_n_top = config.end_n_top
-
-#         self.transformer = TFXLNetMainLayer(config, name='transformer')
-#         self.start_logits = TFPoolerStartLogits(config, name='start_logits')
-#         self.end_logits = TFPoolerEndLogits(config, name='end_logits')
-#         self.answer_class = TFPoolerAnswerClass(config, name='answer_class')
-
-#     def call(self, inputs, training=False):
-#         transformer_outputs = self.transformer(inputs, training=training)
-#         hidden_states = transformer_outputs[0]
-#         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
-
-#         outputs = transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-#         if start_positions is not None and end_positions is not None:
-#             # If we are on multi-GPU, let's remove the dimension added by batch splitting
-#             for x in (start_positions, end_positions, cls_index, is_impossible):
-#                 if x is not None and x.dim() > 1:
-#                     x.squeeze_(-1)
-
-#             # during training, compute the end logits based on the ground truth of the start position
-#             end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
-
-#             loss_fct = CrossEntropyLoss()
-#             start_loss = loss_fct(start_logits, start_positions)
-#             end_loss = loss_fct(end_logits, end_positions)
-#             total_loss = (start_loss + end_loss) / 2
-
-#             if cls_index is not None and is_impossible is not None:
-#                 # Predict answerability from the representation of CLS and START
-#                 cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
-#                 loss_fct_cls = nn.BCEWithLogitsLoss()
-#                 cls_loss = loss_fct_cls(cls_logits, is_impossible)
-
-#                 # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
-#                 total_loss += cls_loss * 0.5
-
-#             outputs = (total_loss,) + outputs
-
-#         else:
-#             # during inference, compute the end logits based on beam search
-#             bsz, slen, hsz = hidden_states.size()
-#             start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
-
-#             start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
-#             start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
-#             start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
-#             start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
-
-#             hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
-#             p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
-#             end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-#             end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
-
-#             end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
-#             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
-#             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
-
-#             start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)  # get the representation of START as weighted sum of hidden states
-#             cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)  # Shape (batch size,): one single `cls_logits` for each sample
-
-#             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
-
-#         # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
-#         # or (if labels are provided) (total_loss,)
-#         return outputs
diff --git a/server/transformers/src/transformers/modeling_transfo_xl.py b/server/transformers/src/transformers/modeling_transfo_xl.py
deleted file mode 100644
index 05bb5f7e3eb29c23d9fc1d03c352491177afa3e3..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_transfo_xl.py
+++ /dev/null
@@ -1,945 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Transformer XL model.
-    Adapted from https://github.com/kimiyoung/transformer-xl.
-    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
-"""
-
-
-import logging
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_transfo_xl_utilities import LogUniformSampler, ProjectedAdaptiveLogSoftmax, sample_logits
-from .modeling_utils import PreTrainedModel
-
-
-logger = logging.getLogger(__name__)
-
-TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
-}
-
-
-def build_tf_to_pytorch_map(model, config):
-    """ A map of modules from TF to PyTorch.
-        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
-    """
-    tf_to_pt_map = {}
-
-    if hasattr(model, "transformer"):
-        # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax
-        tf_to_pt_map.update(
-            {
-                "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
-                "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
-            }
-        )
-        for i, (out_l, proj_l, tie_proj) in enumerate(
-            zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
-        ):
-            layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
-            if config.tie_weight:
-                tf_to_pt_map.update({layer_str + "b": out_l.bias})
-            else:
-                raise NotImplementedError
-                # I don't think this is implemented in the TF code
-                tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
-            if not tie_proj:
-                tf_to_pt_map.update({layer_str + "proj": proj_l})
-        # Now load the rest of the transformer
-        model = model.transformer
-
-    # Embeddings
-    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
-        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
-        tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
-
-    # Transformer blocks
-    for i, b in enumerate(model.layers):
-        layer_str = "transformer/layer_%d/" % i
-        tf_to_pt_map.update(
-            {
-                layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
-                layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
-                layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
-                layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
-                layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
-                layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
-                layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
-                layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
-                layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
-                layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
-                layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
-            }
-        )
-
-    # Relative positioning biases
-    if config.untie_r:
-        r_r_list = []
-        r_w_list = []
-        for b in model.layers:
-            r_r_list.append(b.dec_attn.r_r_bias)
-            r_w_list.append(b.dec_attn.r_w_bias)
-    else:
-        r_r_list = [model.r_r_bias]
-        r_w_list = [model.r_w_bias]
-    tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
-    return tf_to_pt_map
-
-
-def load_tf_weights_in_transfo_xl(model, config, tf_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    try:
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_to_pytorch_map(model, config)
-
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    tf_weights = {}
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        tf_weights[name] = array
-
-    for name, pointer in tf_to_pt_map.items():
-        assert name in tf_weights
-        array = tf_weights[name]
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if "kernel" in name or "proj" in name:
-            array = np.transpose(array)
-        if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
-            # Here we will split the TF weigths
-            assert len(pointer) == array.shape[0]
-            for i, p_i in enumerate(pointer):
-                arr_i = array[i, ...]
-                try:
-                    assert p_i.shape == arr_i.shape
-                except AssertionError as e:
-                    e.args += (p_i.shape, arr_i.shape)
-                    raise
-                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
-                p_i.data = torch.from_numpy(arr_i)
-        else:
-            try:
-                assert pointer.shape == array.shape
-            except AssertionError as e:
-                e.args += (pointer.shape, array.shape)
-                raise
-            logger.info("Initialize PyTorch weight {}".format(name))
-            pointer.data = torch.from_numpy(array)
-        tf_weights.pop(name, None)
-        tf_weights.pop(name + "/Adam", None)
-        tf_weights.pop(name + "/Adam_1", None)
-
-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
-    return model
-
-
-class PositionalEmbedding(nn.Module):
-    def __init__(self, demb):
-        super().__init__()
-
-        self.demb = demb
-
-        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
-        self.register_buffer("inv_freq", inv_freq)
-
-    def forward(self, pos_seq, bsz=None):
-        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
-        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
-
-        if bsz is not None:
-            return pos_emb[:, None, :].expand(-1, bsz, -1)
-        else:
-            return pos_emb[:, None, :]
-
-
-class PositionwiseFF(nn.Module):
-    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
-        super().__init__()
-
-        self.d_model = d_model
-        self.d_inner = d_inner
-        self.dropout = dropout
-
-        self.CoreNet = nn.Sequential(
-            nn.Linear(d_model, d_inner),
-            nn.ReLU(inplace=True),
-            nn.Dropout(dropout),
-            nn.Linear(d_inner, d_model),
-            nn.Dropout(dropout),
-        )
-
-        self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
-
-        self.pre_lnorm = pre_lnorm
-
-    def forward(self, inp):
-        if self.pre_lnorm:
-            # layer normalization + positionwise feed-forward
-            core_out = self.CoreNet(self.layer_norm(inp))
-
-            # residual connection
-            output = core_out + inp
-        else:
-            # positionwise feed-forward
-            core_out = self.CoreNet(inp)
-
-            # residual connection + layer normalization
-            output = self.layer_norm(inp + core_out)
-
-        return output
-
-
-class RelPartialLearnableMultiHeadAttn(nn.Module):
-    def __init__(
-        self,
-        n_head,
-        d_model,
-        d_head,
-        dropout,
-        dropatt=0,
-        tgt_len=None,
-        ext_len=None,
-        mem_len=None,
-        pre_lnorm=False,
-        r_r_bias=None,
-        r_w_bias=None,
-        output_attentions=False,
-        layer_norm_epsilon=1e-5,
-    ):
-        super().__init__()
-
-        self.output_attentions = output_attentions
-        self.n_head = n_head
-        self.d_model = d_model
-        self.d_head = d_head
-        self.dropout = dropout
-
-        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
-
-        self.drop = nn.Dropout(dropout)
-        self.dropatt = nn.Dropout(dropatt)
-        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
-
-        self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
-
-        self.scale = 1 / (d_head ** 0.5)
-
-        self.pre_lnorm = pre_lnorm
-
-        if r_r_bias is None or r_w_bias is None:  # Biases are not shared
-            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-        else:
-            self.r_r_bias = r_r_bias
-            self.r_w_bias = r_w_bias
-
-        self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
-
-    def _rel_shift(self, x):
-        zero_pad_shape = (x.size(0), 1) + x.size()[2:]
-        zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
-        x_padded = torch.cat([zero_pad, x], dim=1)
-
-        x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
-        x_padded = x_padded.view(*x_padded_shape)
-
-        x = x_padded[1:].view_as(x)
-
-        return x
-
-    def forward(self, w, r, attn_mask=None, mems=None, head_mask=None):
-        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
-
-        if mems is not None:
-            cat = torch.cat([mems, w], 0)
-            if self.pre_lnorm:
-                w_heads = self.qkv_net(self.layer_norm(cat))
-            else:
-                w_heads = self.qkv_net(cat)
-            r_head_k = self.r_net(r)
-
-            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
-            w_head_q = w_head_q[-qlen:]
-        else:
-            if self.pre_lnorm:
-                w_heads = self.qkv_net(self.layer_norm(w))
-            else:
-                w_heads = self.qkv_net(w)
-            r_head_k = self.r_net(r)
-
-            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
-
-        klen = w_head_k.size(0)
-
-        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
-        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
-        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
-
-        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # qlen x n_head x d_head
-
-        # compute attention score
-        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
-        AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head
-
-        rr_head_q = w_head_q + self.r_r_bias
-        BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k))  # qlen x klen x bsz x n_head
-        BD = self._rel_shift(BD)
-
-        # [qlen x klen x bsz x n_head]
-        attn_score = AC + BD
-        attn_score.mul_(self.scale)
-
-        # compute attention probability
-        if attn_mask is not None and torch.sum(attn_mask).item():
-            attn_mask = attn_mask == 1  # Switch to bool
-            if attn_mask.dim() == 2:
-                if next(self.parameters()).dtype == torch.float16:
-                    attn_score = (
-                        attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score)
-                    )
-                else:
-                    attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score)
-            elif attn_mask.dim() == 3:
-                if next(self.parameters()).dtype == torch.float16:
-                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score)
-                else:
-                    attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score)
-
-        # [qlen x klen x bsz x n_head]
-        attn_prob = F.softmax(attn_score, dim=1)
-        attn_prob = self.dropatt(attn_prob)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_prob = attn_prob * head_mask
-
-        # compute attention vector
-        attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))
-
-        # [qlen x bsz x n_head x d_head]
-        attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
-
-        # linear projection
-        attn_out = self.o_net(attn_vec)
-        attn_out = self.drop(attn_out)
-
-        if self.pre_lnorm:
-            # residual connection
-            outputs = [w + attn_out]
-        else:
-            # residual connection + layer normalization
-            outputs = [self.layer_norm(w + attn_out)]
-
-        if self.output_attentions:
-            outputs.append(attn_prob)
-
-        return outputs
-
-
-class RelPartialLearnableDecoderLayer(nn.Module):
-    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
-        super().__init__()
-
-        self.dec_attn = RelPartialLearnableMultiHeadAttn(
-            n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
-        )
-        self.pos_ff = PositionwiseFF(
-            d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
-        )
-
-    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
-
-        attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask)
-        ff_output = self.pos_ff(attn_outputs[0])
-
-        outputs = [ff_output] + attn_outputs[1:]
-
-        return outputs
-
-
-class AdaptiveEmbedding(nn.Module):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
-        super().__init__()
-
-        self.n_token = n_token
-        self.d_embed = d_embed
-
-        self.cutoffs = cutoffs + [n_token]
-        self.div_val = div_val
-        self.d_proj = d_proj
-
-        self.emb_scale = d_proj ** 0.5
-
-        self.cutoff_ends = [0] + self.cutoffs
-
-        self.emb_layers = nn.ModuleList()
-        self.emb_projs = nn.ParameterList()
-        if div_val == 1:
-            self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
-            if d_proj != d_embed:
-                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
-        else:
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                d_emb_i = d_embed // (div_val ** i)
-                self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
-                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
-
-    def forward(self, inp):
-        if self.div_val == 1:
-            embed = self.emb_layers[0](inp)
-            if self.d_proj != self.d_embed:
-                embed = F.linear(embed, self.emb_projs[0])
-        else:
-            param = next(self.parameters())
-            inp_flat = inp.view(-1)
-            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-
-                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
-                indices_i = mask_i.nonzero().squeeze()
-
-                if indices_i.numel() == 0:
-                    continue
-
-                inp_i = inp_flat.index_select(0, indices_i) - l_idx
-                emb_i = self.emb_layers[i](inp_i)
-                emb_i = F.linear(emb_i, self.emb_projs[i])
-
-                emb_flat.index_copy_(0, indices_i, emb_i)
-
-            embed_shape = inp.size() + (self.d_proj,)
-            embed = emb_flat.view(embed_shape)
-
-        embed.mul_(self.emb_scale)
-
-        return embed
-
-
-class TransfoXLPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = TransfoXLConfig
-    pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_transfo_xl
-    base_model_prefix = "transformer"
-
-    def _init_weight(self, weight):
-        if self.config.init == "uniform":
-            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
-        elif self.config.init == "normal":
-            nn.init.normal_(weight, 0.0, self.config.init_std)
-
-    def _init_bias(self, bias):
-        nn.init.constant_(bias, 0.0)
-
-    def _init_weights(self, m):
-        """ Initialize the weights.
-        """
-        classname = m.__class__.__name__
-        if classname.find("Linear") != -1:
-            if hasattr(m, "weight") and m.weight is not None:
-                self._init_weight(m.weight)
-            if hasattr(m, "bias") and m.bias is not None:
-                self._init_bias(m.bias)
-        elif classname.find("AdaptiveEmbedding") != -1:
-            if hasattr(m, "emb_projs"):
-                for i in range(len(m.emb_projs)):
-                    if m.emb_projs[i] is not None:
-                        nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find("Embedding") != -1:
-            if hasattr(m, "weight"):
-                self._init_weight(m.weight)
-        elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
-            if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
-                self._init_weight(m.cluster_weight)
-            if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
-                self._init_bias(m.cluster_bias)
-            if hasattr(m, "out_projs"):
-                for i in range(len(m.out_projs)):
-                    if m.out_projs[i] is not None:
-                        nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find("LayerNorm") != -1:
-            if hasattr(m, "weight"):
-                nn.init.normal_(m.weight, 1.0, self.config.init_std)
-            if hasattr(m, "bias") and m.bias is not None:
-                self._init_bias(m.bias)
-        else:
-            if hasattr(m, "r_emb"):
-                self._init_weight(m.r_emb)
-            if hasattr(m, "r_w_bias"):
-                self._init_weight(m.r_w_bias)
-            if hasattr(m, "r_r_bias"):
-                self._init_weight(m.r_r_bias)
-            if hasattr(m, "r_bias"):
-                self._init_bias(m.r_bias)
-
-
-TRANSFO_XL_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-TRANSFO_XL_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
-            given to this model should not be passed as input ids as they have already been computed.
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
-    TRANSFO_XL_START_DOCSTRING,
-)
-class TransfoXLModel(TransfoXLPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        self.n_token = config.vocab_size
-
-        self.d_embed = config.d_embed
-        self.d_model = config.d_model
-        self.n_head = config.n_head
-        self.d_head = config.d_head
-
-        self.word_emb = AdaptiveEmbedding(
-            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
-        )
-
-        self.drop = nn.Dropout(config.dropout)
-
-        self.n_layer = config.n_layer
-
-        self.tgt_len = config.tgt_len
-        self.mem_len = config.mem_len
-        self.ext_len = config.ext_len
-        self.max_klen = config.tgt_len + config.ext_len + config.mem_len
-
-        self.attn_type = config.attn_type
-
-        if not config.untie_r:
-            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-
-        self.layers = nn.ModuleList()
-        if config.attn_type == 0:  # the default attention
-            for i in range(config.n_layer):
-                self.layers.append(
-                    RelPartialLearnableDecoderLayer(
-                        config.n_head,
-                        config.d_model,
-                        config.d_head,
-                        config.d_inner,
-                        config.dropout,
-                        tgt_len=config.tgt_len,
-                        ext_len=config.ext_len,
-                        mem_len=config.mem_len,
-                        dropatt=config.dropatt,
-                        pre_lnorm=config.pre_lnorm,
-                        r_w_bias=None if config.untie_r else self.r_w_bias,
-                        r_r_bias=None if config.untie_r else self.r_r_bias,
-                        output_attentions=self.output_attentions,
-                        layer_norm_epsilon=config.layer_norm_epsilon,
-                    )
-                )
-        else:  # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
-            raise NotImplementedError  # Removed them to avoid maintaining dead code
-
-        self.same_length = config.same_length
-        self.clamp_len = config.clamp_len
-
-        if self.attn_type == 0:  # default attention
-            self.pos_emb = PositionalEmbedding(self.d_model)
-        else:  # learnable embeddings and absolute embeddings
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.word_emb
-
-    def set_input_embeddings(self, new_embeddings):
-        self.word_emb = new_embeddings
-
-    def backward_compatible(self):
-        self.sample_softmax = -1
-
-    def reset_length(self, tgt_len, ext_len, mem_len):
-        self.tgt_len = tgt_len
-        self.mem_len = mem_len
-        self.ext_len = ext_len
-
-    def _prune_heads(self, heads):
-        logger.info("Head pruning is not implemented for Transformer-XL model")
-        pass
-
-    def init_mems(self, bsz):
-        if self.mem_len > 0:
-            mems = []
-            param = next(self.parameters())
-            for i in range(self.n_layer):
-                empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
-                mems.append(empty)
-
-            return mems
-        else:
-            return None
-
-    def _update_mems(self, hids, mems, qlen, mlen):
-        # does not deal with None
-        if mems is None:
-            return None
-
-        # mems is not None
-        assert len(hids) == len(mems), "len(hids) != len(mems)"
-
-        # There are `mlen + qlen` steps that can be cached into mems
-        # For the next step, the last `ext_len` of the `qlen` tokens
-        # will be used as the extended context. Hence, we only cache
-        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
-        # to `mlen + qlen - self.ext_len`.
-        with torch.no_grad():
-            new_mems = []
-            end_idx = mlen + max(0, qlen - 0 - self.ext_len)
-            beg_idx = max(0, end_idx - self.mem_len)
-            for i in range(len(hids)):
-
-                cat = torch.cat([mems[i], hids[i]], dim=0)
-                new_mems.append(cat[beg_idx:end_idx].detach())
-
-        return new_mems
-
-    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
-        """
-        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
-        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_ids = input_ids.transpose(0, 1).contiguous()
-            qlen, bsz = input_ids.size()
-        elif inputs_embeds is not None:
-            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
-            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        if mems is None:
-            mems = self.init_mems(bsz)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
-        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
-                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.n_layer
-
-        if inputs_embeds is not None:
-            word_emb = inputs_embeds
-        else:
-            word_emb = self.word_emb(input_ids)
-
-        mlen = mems[0].size(0) if mems is not None else 0
-        klen = mlen + qlen
-        if self.same_length:
-            all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
-            mask_len = klen - self.mem_len
-            if mask_len > 0:
-                mask_shift_len = qlen - mask_len
-            else:
-                mask_shift_len = qlen
-            dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None]  # -1
-        else:
-            dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[
-                :, :, None
-            ]
-
-        hids = []
-        attentions = []
-        if self.attn_type == 0:  # default
-            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype)
-            if self.clamp_len > 0:
-                pos_seq.clamp_(max=self.clamp_len)
-            pos_emb = self.pos_emb(pos_seq)
-
-            core_out = self.drop(word_emb)
-            pos_emb = self.drop(pos_emb)
-
-            for i, layer in enumerate(self.layers):
-                hids.append(core_out)
-                mems_i = None if mems is None else mems[i]
-                layer_outputs = layer(
-                    core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i]
-                )
-                core_out = layer_outputs[0]
-                if self.output_attentions:
-                    attentions.append(layer_outputs[1])
-        else:  # learnable embeddings and absolute embeddings
-            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
-
-        core_out = self.drop(core_out)
-
-        new_mems = self._update_mems(hids, mems, mlen, qlen)
-
-        # We transpose back here to shape [bsz, len, hidden_dim]
-        outputs = [core_out.transpose(0, 1).contiguous(), new_mems]
-        if self.output_hidden_states:
-            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
-            hids.append(core_out)
-            hids = list(t.transpose(0, 1).contiguous() for t in hids)
-            outputs.append(hids)
-        if self.output_attentions:
-            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
-            attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
-            outputs.append(attentions)
-
-        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
-
-
-@add_start_docstrings(
-    """The Transformer-XL Model with a language modeling head on top
-    (adaptive softmax with weights tied to the adaptive input embeddings)""",
-    TRANSFO_XL_START_DOCSTRING,
-)
-class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = TransfoXLModel(config)
-        self.sample_softmax = config.sample_softmax
-        # use sampled softmax
-        if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
-            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
-        # use adaptive softmax (including standard softmax)
-        else:
-            self.crit = ProjectedAdaptiveLogSoftmax(
-                config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
-            )
-        self.init_weights()
-
-    def tie_weights(self):
-        """
-        Run this to be sure output and input (adaptive) softmax weights are tied
-        """
-        # sampled softmax
-        if self.sample_softmax > 0:
-            if self.config.tie_weight:
-                self.out_layer.weight = self.transformer.word_emb.weight
-        # adaptive softmax (including standard softmax)
-        else:
-            if self.config.tie_weight:
-                for i in range(len(self.crit.out_layers)):
-                    self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
-            if self.config.tie_projs:
-                for i, tie_proj in enumerate(self.config.tie_projs):
-                    if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
-                        if self.config.torchscript:
-                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
-                        else:
-                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
-                    elif tie_proj and self.config.div_val != 1:
-                        if self.config.torchscript:
-                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
-                        else:
-                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
-
-    def reset_length(self, tgt_len, ext_len, mem_len):
-        self.transformer.reset_length(tgt_len, ext_len, mem_len)
-
-    def init_mems(self, bsz):
-        return self.transformer.init_mems(bsz)
-
-    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
-        """
-        if input_ids is not None:
-            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
-        elif inputs_embeds is not None:
-            bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds)
-
-        last_hidden = transformer_outputs[0]
-        pred_hid = last_hidden[:, -tgt_len:]
-        outputs = transformer_outputs[1:]
-        if self.sample_softmax > 0 and self.training:
-            assert self.config.tie_weight
-            logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, labels, pred_hid, self.sampler)
-            softmax_output = -F.log_softmax(logit, -1)[:, :, 0]
-            outputs = [softmax_output] + outputs
-            if labels is not None:
-                # TODO: This is not implemented
-                raise NotImplementedError
-        else:
-            softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels)
-            if labels is None:
-                softmax_output = softmax_output.view(bsz, tgt_len, -1)
-                outputs = [softmax_output] + outputs
-            else:
-                softmax_output = softmax_output.view(bsz, tgt_len)
-                outputs = [softmax_output, None] + outputs
-
-        return outputs  # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions)
-
-    def get_output_embeddings(self):
-        """ Double-check if you are using adaptive softmax.
-        """
-        if self.sample_softmax > 0:
-            return self.out_layer
-        else:
-            return self.crit.out_layers[-1]
-
-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
-        inputs = {"input_ids": input_ids}
-
-        # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
-
-        return inputs
diff --git a/server/transformers/src/transformers/modeling_transfo_xl_utilities.py b/server/transformers/src/transformers/modeling_transfo_xl_utilities.py
deleted file mode 100644
index ef12316673bdb437ea9ac5062a5c48a99748ee11..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_transfo_xl_utilities.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Utilities for PyTorch Transformer XL model.
-    Directly adapted from https://github.com/kimiyoung/transformer-xl.
-"""
-
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
-# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
-
-
-class ProjectedAdaptiveLogSoftmax(nn.Module):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
-        super().__init__()
-
-        self.n_token = n_token
-        self.d_embed = d_embed
-        self.d_proj = d_proj
-
-        self.cutoffs = cutoffs + [n_token]
-        self.cutoff_ends = [0] + self.cutoffs
-        self.div_val = div_val
-
-        self.shortlist_size = self.cutoffs[0]
-        self.n_clusters = len(self.cutoffs) - 1
-        self.head_size = self.shortlist_size + self.n_clusters
-
-        if self.n_clusters > 0:
-            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
-            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
-
-        self.out_layers = nn.ModuleList()
-        self.out_projs = nn.ParameterList()
-
-        if div_val == 1:
-            for i in range(len(self.cutoffs)):
-                if d_proj != d_embed:
-                    self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
-                else:
-                    self.out_projs.append(None)
-
-            self.out_layers.append(nn.Linear(d_embed, n_token))
-        else:
-            for i in range(len(self.cutoffs)):
-                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                d_emb_i = d_embed // (div_val ** i)
-
-                self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
-
-                self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
-
-        self.keep_order = keep_order
-
-    def _compute_logit(self, hidden, weight, bias, proj):
-        if proj is None:
-            logit = F.linear(hidden, weight, bias=bias)
-        else:
-            # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
-            proj_hid = F.linear(hidden, proj.t().contiguous())
-            logit = F.linear(proj_hid, weight, bias=bias)
-            # else:
-            #     logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
-            #     if bias is not None:
-            #         logit = logit + bias
-
-        return logit
-
-    def forward(self, hidden, labels=None, keep_order=False):
-        """
-            Params:
-                hidden :: [len*bsz x d_proj]
-                labels :: [len*bsz]
-            Return:
-                if labels is None:
-                    out :: [len*bsz] Negative log likelihood
-                else:
-                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
-            We could replace this implementation by the native PyTorch one
-            if their's had an option to set bias on all clusters in the native one.
-            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
-        """
-
-        if labels is not None:
-            labels = labels.view(-1)
-            if hidden.size(0) != labels.size(0):
-                raise RuntimeError("Input and labels should have the same size " "in the batch dimension.")
-
-        if self.n_clusters == 0:
-            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
-            if labels is not None:
-                out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1)
-            else:
-                out = F.log_softmax(logit, dim=-1)
-        else:
-            # construct weights and biases
-            weights, biases = [], []
-            for i in range(len(self.cutoffs)):
-                if self.div_val == 1:
-                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
-                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
-                else:
-                    weight_i = self.out_layers[i].weight
-                    bias_i = self.out_layers[i].bias
-
-                if i == 0:
-                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
-                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
-
-                weights.append(weight_i)
-                biases.append(bias_i)
-
-            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
-
-            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
-            head_logprob = F.log_softmax(head_logit, dim=1)
-
-            if labels is None:
-                out = hidden.new_empty((head_logit.size(0), self.n_token))
-            else:
-                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
-
-            offset = 0
-            cutoff_values = [0] + self.cutoffs
-            for i in range(len(cutoff_values) - 1):
-                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
-
-                if labels is not None:
-                    mask_i = (labels >= l_idx) & (labels < r_idx)
-                    indices_i = mask_i.nonzero().squeeze()
-
-                    if indices_i.numel() == 0:
-                        continue
-
-                    target_i = labels.index_select(0, indices_i) - l_idx
-                    head_logprob_i = head_logprob.index_select(0, indices_i)
-                    hidden_i = hidden.index_select(0, indices_i)
-                else:
-                    hidden_i = hidden
-
-                if i == 0:
-                    if labels is not None:
-                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
-                    else:
-                        out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
-                else:
-                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
-
-                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
-                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
-                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
-                    if labels is not None:
-                        logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(
-                            1, target_i[:, None]
-                        ).squeeze(1)
-                    else:
-                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
-                        out[:, l_idx:r_idx] = logprob_i
-
-                if labels is not None:
-                    if (hasattr(self, "keep_order") and self.keep_order) or keep_order:
-                        out.index_copy_(0, indices_i, -logprob_i)
-                    else:
-                        out[offset : offset + logprob_i.size(0)].copy_(-logprob_i)
-                    offset += logprob_i.size(0)
-
-        return out
-
-    def log_prob(self, hidden):
-        r""" Computes log probabilities for all :math:`n\_classes`
-        From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
-        Args:
-            hidden (Tensor): a minibatch of examples
-        Returns:
-            log-probabilities of for each class :math:`c`
-            in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
-            parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
-        Shape:
-            - Input: :math:`(N, in\_features)`
-            - Output: :math:`(N, n\_classes)`
-        """
-        if self.n_clusters == 0:
-            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
-            return F.log_softmax(logit, dim=-1)
-        else:
-            # construct weights and biases
-            weights, biases = [], []
-            for i in range(len(self.cutoffs)):
-                if self.div_val == 1:
-                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
-                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
-                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
-                else:
-                    weight_i = self.out_layers[i].weight
-                    bias_i = self.out_layers[i].bias
-
-                if i == 0:
-                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
-                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
-
-                weights.append(weight_i)
-                biases.append(bias_i)
-
-            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
-            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
-
-            out = hidden.new_empty((head_logit.size(0), self.n_token))
-            head_logprob = F.log_softmax(head_logit, dim=1)
-
-            cutoff_values = [0] + self.cutoffs
-            for i in range(len(cutoff_values) - 1):
-                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
-
-                if i == 0:
-                    out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
-                else:
-                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
-
-                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
-                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
-
-                    logprob_i = head_logprob[:, -i] + tail_logprob_i
-                    out[:, start_idx, stop_idx] = logprob_i
-
-            return out
-
-
-class LogUniformSampler(object):
-    def __init__(self, range_max, n_sample):
-        """
-        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
-            `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
-
-        expected count can be approximated by 1 - (1 - p)^n
-        and we use a numerically stable version -expm1(num_tries * log1p(-p))
-
-        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
-        """
-        with torch.no_grad():
-            self.range_max = range_max
-            log_indices = torch.arange(1.0, range_max + 2.0, 1.0).log_()
-            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-
-            self.log_q = (-(-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
-
-        self.n_sample = n_sample
-
-    def sample(self, labels):
-        """
-            labels: [b1, b2]
-        Return
-            true_log_probs: [b1, b2]
-            samp_log_probs: [n_sample]
-            neg_samples: [n_sample]
-        """
-
-        # neg_samples = torch.empty(0).long()
-        n_sample = self.n_sample
-        n_tries = 2 * n_sample
-
-        with torch.no_grad():
-            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
-            device = labels.device
-            neg_samples = neg_samples.to(device)
-            true_log_probs = self.log_q[labels].to(device)
-            samp_log_probs = self.log_q[neg_samples].to(device)
-            return true_log_probs, samp_log_probs, neg_samples
-
-
-def sample_logits(embedding, bias, labels, inputs, sampler):
-    """
-        embedding: an nn.Embedding layer
-        bias: [n_vocab]
-        labels: [b1, b2]
-        inputs: [b1, b2, n_emb]
-        sampler: you may use a LogUniformSampler
-    Return
-        logits: [b1, b2, 1 + n_sample]
-    """
-    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
-    n_sample = neg_samples.size(0)
-    b1, b2 = labels.size(0), labels.size(1)
-    all_ids = torch.cat([labels.view(-1), neg_samples])
-    all_w = embedding(all_ids)
-    true_w = all_w[:-n_sample].view(b1, b2, -1)
-    sample_w = all_w[-n_sample:].view(n_sample, -1)
-
-    all_b = bias[all_ids]
-    true_b = all_b[:-n_sample].view(b1, b2)
-    sample_b = all_b[-n_sample:]
-
-    hit = (labels[:, :, None] == neg_samples).detach()
-
-    true_logits = torch.einsum("ijk,ijk->ij", [true_w, inputs]) + true_b - true_log_probs
-    sample_logits = torch.einsum("lk,ijk->ijl", [sample_w, inputs]) + sample_b - samp_log_probs
-    sample_logits.masked_fill_(hit, -1e30)
-    logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
-
-    return logits
diff --git a/server/transformers/src/transformers/modeling_utils.py b/server/transformers/src/transformers/modeling_utils.py
deleted file mode 100644
index 7edfa7f0b3b82959b9205e2b6cd6b9274a380512..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_utils.py
+++ /dev/null
@@ -1,1517 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch BERT model."""
-
-
-import logging
-import os
-from itertools import zip_longest
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from torch.nn import functional as F
-
-from .configuration_utils import PretrainedConfig
-from .file_utils import (
-    DUMMY_INPUTS,
-    TF2_WEIGHTS_NAME,
-    TF_WEIGHTS_NAME,
-    WEIGHTS_NAME,
-    cached_path,
-    hf_bucket_url,
-    is_remote_url,
-)
-
-
-logger = logging.getLogger(__name__)
-
-try:
-    from torch.nn import Identity
-except ImportError:
-    # Older PyTorch compatibility
-    class Identity(nn.Module):
-        r"""A placeholder identity operator that is argument-insensitive.
-        """
-
-        def __init__(self, *args, **kwargs):
-            super().__init__()
-
-        def forward(self, input):
-            return input
-
-
-class ModuleUtilsMixin:
-    """
-    A few utilities for torch.nn.Modules, to be used as a mixin.
-    """
-
-    def num_parameters(self, only_trainable: bool = False) -> int:
-        """
-        Get number of (optionally, trainable) parameters in the module.
-        """
-        params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters()
-        return sum(p.numel() for p in params)
-
-
-class PreTrainedModel(nn.Module, ModuleUtilsMixin):
-    r""" Base class for all models.
-
-        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
-
-        Class attributes (overridden by derived classes):
-            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
-            - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
-            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
-
-                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
-                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
-                - ``path``: a path (string) to the TensorFlow checkpoint.
-
-            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
-    """
-    config_class = None
-    pretrained_model_archive_map = {}
-    base_model_prefix = ""
-
-    @property
-    def dummy_inputs(self):
-        """ Dummy inputs to do a forward pass in the network.
-
-        Returns:
-            torch.Tensor with dummy inputs
-        """
-        return {"input_ids": torch.tensor(DUMMY_INPUTS)}
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__()
-        if not isinstance(config, PretrainedConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        # Save config in model
-        self.config = config
-
-    @property
-    def base_model(self):
-        return getattr(self, self.base_model_prefix, self)
-
-    def get_input_embeddings(self):
-        """
-        Returns the model's input embeddings.
-
-        Returns:
-            :obj:`nn.Module`:
-                A torch module mapping vocabulary to hidden states.
-        """
-        base_model = getattr(self, self.base_model_prefix, self)
-        if base_model is not self:
-            return base_model.get_input_embeddings()
-        else:
-            raise NotImplementedError
-
-    def set_input_embeddings(self, value):
-        """
-        Set model's input embeddings
-
-        Args:
-            value (:obj:`nn.Module`):
-                A module mapping vocabulary to hidden states.
-        """
-        base_model = getattr(self, self.base_model_prefix, self)
-        if base_model is not self:
-            base_model.set_input_embeddings(value)
-        else:
-            raise NotImplementedError
-
-    def get_output_embeddings(self):
-        """
-        Returns the model's output embeddings.
-
-        Returns:
-            :obj:`nn.Module`:
-                A torch module mapping hidden states to vocabulary.
-        """
-        return None  # Overwrite for models with output embeddings
-
-    def tie_weights(self):
-        """
-        Tie the weights between the input embeddings and the output embeddings.
-        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
-        the weights instead.
-        """
-        output_embeddings = self.get_output_embeddings()
-        if output_embeddings is not None:
-            self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
-
-    def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
-        """ Tie or clone module weights depending of weither we are using TorchScript or not
-        """
-        if self.config.torchscript:
-            output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
-        else:
-            output_embeddings.weight = input_embeddings.weight
-
-        if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None:
-            output_embeddings.bias.data = torch.nn.functional.pad(
-                output_embeddings.bias.data,
-                (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
-                "constant",
-                0,
-            )
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-
-    def resize_token_embeddings(self, new_num_tokens=None):
-        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
-        Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
-
-        Arguments:
-
-            new_num_tokens: (`optional`) int:
-                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
-                If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
-
-        Return: ``torch.nn.Embeddings``
-            Pointer to the input tokens Embeddings Module of the model
-        """
-        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
-        if new_num_tokens is None:
-            return model_embeds
-
-        # Update base model and current model config
-        self.config.vocab_size = new_num_tokens
-        base_model.vocab_size = new_num_tokens
-
-        # Tie weights again if needed
-        self.tie_weights()
-
-        return model_embeds
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        old_embeddings = self.get_input_embeddings()
-        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
-        self.set_input_embeddings(new_embeddings)
-        return self.get_input_embeddings()
-
-    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
-        """ Build a resized Embedding Module from a provided token Embedding Module.
-            Increasing the size will add newly initialized vectors at the end
-            Reducing the size will remove vectors from the end
-
-        Args:
-            new_num_tokens: (`optional`) int
-                New number of tokens in the embedding matrix.
-                Increasing the size will add newly initialized vectors at the end
-                Reducing the size will remove vectors from the end
-                If not provided or None: return the provided token Embedding Module.
-        Return: ``torch.nn.Embeddings``
-            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
-        """
-        if new_num_tokens is None:
-            return old_embeddings
-
-        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
-        if old_num_tokens == new_num_tokens:
-            return old_embeddings
-
-        # Build new embeddings
-        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
-        new_embeddings.to(old_embeddings.weight.device)
-
-        # initialize all new embeddings (in particular added tokens)
-        self._init_weights(new_embeddings)
-
-        # Copy word embeddings from the previous weights
-        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
-        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
-
-        return new_embeddings
-
-    def init_weights(self):
-        """ Initialize and prunes weights if needed. """
-        # Initialize weights
-        self.apply(self._init_weights)
-
-        # Prune heads if needed
-        if self.config.pruned_heads:
-            self.prune_heads(self.config.pruned_heads)
-
-        # Tie weights if needed
-        self.tie_weights()
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the base model.
-
-            Arguments:
-
-                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
-                E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
-        """
-        # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
-        for layer, heads in heads_to_prune.items():
-            union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
-            self.config.pruned_heads[layer] = list(union_heads)  # Unfortunately we have to store it as list for JSON
-
-        self.base_model._prune_heads(heads_to_prune)
-
-    def save_pretrained(self, save_directory):
-        """ Save a model and its configuration file to a directory, so that it
-            can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
-        """
-        assert os.path.isdir(
-            save_directory
-        ), "Saving path should be a directory where the model and configuration can be saved"
-
-        # Only save the model itself if we are using distributed training
-        model_to_save = self.module if hasattr(self, "module") else self
-
-        # Attach architecture to the config
-        model_to_save.config.architectures = [model_to_save.__class__.__name__]
-
-        # Save configuration file
-        model_to_save.config.save_pretrained(save_directory)
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
-        torch.save(model_to_save.state_dict(), output_model_file)
-        logger.info("Model weights saved in {}".format(output_model_file))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
-
-        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with ``model.train()``
-
-        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
-        It is up to you to train those weights with a downstream fine-tuning task.
-
-        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
-
-        Parameters:
-            pretrained_model_name_or_path: either:
-              - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-              - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-              - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
-              - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-              - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
-
-            model_args: (`optional`) Sequence of positional arguments:
-                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-
-            config: (`optional`) one of:
-                - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
-                - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
-                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
-                    - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
-                    - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
-
-            state_dict: (`optional`) dict:
-                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            output_loading_info: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
-                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
-
-                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-
-        Examples::
-
-            # For example purposes. Not runnable.
-            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
-            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        config = kwargs.pop("config", None)
-        state_dict = kwargs.pop("state_dict", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        from_tf = kwargs.pop("from_tf", False)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-        output_loading_info = kwargs.pop("output_loading_info", False)
-
-        # Load config if we don't provide a configuration
-        if not isinstance(config, PretrainedConfig):
-            config_path = config if config is not None else pretrained_model_name_or_path
-            config, model_kwargs = cls.config_class.from_pretrained(
-                config_path,
-                *model_args,
-                cache_dir=cache_dir,
-                return_unused_kwargs=True,
-                force_download=force_download,
-                resume_download=resume_download,
-                proxies=proxies,
-                **kwargs,
-            )
-        else:
-            model_kwargs = kwargs
-
-        # Load model
-        if pretrained_model_name_or_path is not None:
-            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-            elif os.path.isdir(pretrained_model_name_or_path):
-                if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
-                    # Load from a TF 1.0 checkpoint
-                    archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
-                elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
-                    # Load from a TF 2.0 checkpoint
-                    archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
-                elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
-                    # Load from a PyTorch checkpoint
-                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                else:
-                    raise EnvironmentError(
-                        "Error no file named {} found in directory {} or `from_tf` set to False".format(
-                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path
-                        )
-                    )
-            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
-                archive_file = pretrained_model_name_or_path
-            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
-                assert (
-                    from_tf
-                ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
-                    pretrained_model_name_or_path + ".index"
-                )
-                archive_file = pretrained_model_name_or_path + ".index"
-            else:
-                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
-                if from_tf:
-                    raise EnvironmentError(
-                        "Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name."
-                    )
-
-            # redirect to the cache, if necessary
-            try:
-                resolved_archive_file = cached_path(
-                    archive_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    resume_download=resume_download,
-                )
-            except EnvironmentError:
-                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)
-                else:
-                    msg = (
-                        "Model name '{}' was not found in model name list ({}). "
-                        "We assumed '{}' was a path or url to model weight files named one of {} but "
-                        "couldn't find any such file at this path or url.".format(
-                            pretrained_model_name_or_path,
-                            ", ".join(cls.pretrained_model_archive_map.keys()),
-                            archive_file,
-                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME],
-                        )
-                    )
-                raise EnvironmentError(msg)
-
-            if resolved_archive_file == archive_file:
-                logger.info("loading weights file {}".format(archive_file))
-            else:
-                logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
-        else:
-            resolved_archive_file = None
-
-        # Instantiate model.
-        model = cls(config, *model_args, **model_kwargs)
-
-        if state_dict is None and not from_tf:
-            try:
-                state_dict = torch.load(resolved_archive_file, map_location="cpu")
-            except Exception:
-                raise OSError(
-                    "Unable to load weights from pytorch checkpoint file. "
-                    "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
-                )
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-
-        if from_tf:
-            if resolved_archive_file.endswith(".index"):
-                # Load from a TensorFlow 1.X checkpoint - provided by original authors
-                model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
-            else:
-                # Load from our TensorFlow 2.0 checkpoints
-                try:
-                    from transformers import load_tf2_checkpoint_in_pytorch_model
-
-                    model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
-                except ImportError:
-                    logger.error(
-                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
-                    )
-                    raise
-        else:
-            # Convert old format to new format if needed from a PyTorch state_dict
-            old_keys = []
-            new_keys = []
-            for key in state_dict.keys():
-                new_key = None
-                if "gamma" in key:
-                    new_key = key.replace("gamma", "weight")
-                if "beta" in key:
-                    new_key = key.replace("beta", "bias")
-                if new_key:
-                    old_keys.append(key)
-                    new_keys.append(new_key)
-            for old_key, new_key in zip(old_keys, new_keys):
-                state_dict[new_key] = state_dict.pop(old_key)
-
-            # copy state_dict so _load_from_state_dict can modify it
-            metadata = getattr(state_dict, "_metadata", None)
-            state_dict = state_dict.copy()
-            if metadata is not None:
-                state_dict._metadata = metadata
-
-            # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-            # so we need to apply the function recursively.
-            def load(module: nn.Module, prefix=""):
-                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-                module._load_from_state_dict(
-                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-                )
-                for name, child in module._modules.items():
-                    if child is not None:
-                        load(child, prefix + name + ".")
-
-            # Make sure we are able to load base models as well as derived models (with heads)
-            start_prefix = ""
-            model_to_load = model
-            if not hasattr(model, cls.base_model_prefix) and any(
-                s.startswith(cls.base_model_prefix) for s in state_dict.keys()
-            ):
-                start_prefix = cls.base_model_prefix + "."
-            if hasattr(model, cls.base_model_prefix) and not any(
-                s.startswith(cls.base_model_prefix) for s in state_dict.keys()
-            ):
-                model_to_load = getattr(model, cls.base_model_prefix)
-
-            load(model_to_load, prefix=start_prefix)
-            if len(missing_keys) > 0:
-                logger.info(
-                    "Weights of {} not initialized from pretrained model: {}".format(
-                        model.__class__.__name__, missing_keys
-                    )
-                )
-            if len(unexpected_keys) > 0:
-                logger.info(
-                    "Weights from pretrained model not used in {}: {}".format(
-                        model.__class__.__name__, unexpected_keys
-                    )
-                )
-            if len(error_msgs) > 0:
-                raise RuntimeError(
-                    "Error(s) in loading state_dict for {}:\n\t{}".format(
-                        model.__class__.__name__, "\n\t".join(error_msgs)
-                    )
-                )
-
-        model.tie_weights()  # make sure word embedding weights are still tied if needed
-
-        # Set model in evaluation mode to desactivate DropOut modules by default
-        model.eval()
-
-        if output_loading_info:
-            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
-            return model, loading_info
-
-        return model
-
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        return {"input_ids": input_ids}
-
-    def _do_output_past(self, outputs):
-        has_output_past = hasattr(self.config, "output_past") and self.config.output_past
-        has_mem_len = hasattr(self.config, "mem_len") and self.config.mem_len
-
-        if has_output_past and not has_mem_len and len(outputs) > 1:
-            return True
-        elif has_mem_len and self.config.mem_len > 0 and len(outputs) > 1:
-            return True
-
-        return False
-
-    @torch.no_grad()
-    def generate(
-        self,
-        input_ids=None,
-        max_length=None,
-        do_sample=None,
-        num_beams=None,
-        temperature=None,
-        top_k=None,
-        top_p=None,
-        repetition_penalty=None,
-        bos_token_id=None,
-        pad_token_id=None,
-        eos_token_ids=None,
-        length_penalty=None,
-        num_return_sequences=None,
-    ):
-        r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling
-        and beam-search.
-
-        Adapted in part from `Facebook's XLM beam search code`_.
-
-        .. _`Facebook's XLM beam search code`:
-           https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529
-
-
-        Parameters:
-
-            input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)`
-                The sequence used as a prompt for the generation. If `None` the method initializes
-                it as an empty `torch.LongTensor` of shape `(1,)`.
-
-            max_length: (`optional`) int
-                The max length of the sequence to be generated.  Between 1 and infinity. Default to 20.
-
-            do_sample: (`optional`) bool
-                If set to `False` greedy decoding is used. Otherwise sampling is used. Default to greedy sampling.
-
-            num_beams: (`optional`) int
-                Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.
-
-            temperature: (`optional`) float
-                The value used to module the next token probabilities. Must be strictely positive. Default to 1.0.
-
-            top_k: (`optional`) int
-                The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
-
-            top_p: (`optional`) float
-                The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
-
-            repetition_penalty: (`optional`) float
-                The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
-
-            bos_token_id: (`optional`) int
-                Beginning of sentence token if no prompt is provided. Default to 0.
-
-            eos_token_ids: (`optional`) int or list of int
-                End of sequence token or list of tokens to stop the generation. Default to 0.
-            length_penalty: (`optional`) float
-                Exponential penalty to the length. Default to 1.
-
-            num_return_sequences: (`optional`) int
-                The number of independently computed returned sequences for each element in the batch. Default to 1.
-
-        Examples::
-
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
-            outputs = model.generate(max_length=40, bos_token_id=tokenizer.bos_token_id, eos_token_ids=tokenizer.eos_token_id)  # do greedy decoding without beam search
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
-            input_context = 'The dog'
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, do_sample=True, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
-            for i in range(3): #  3 output sequences were generated
-                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[0][i], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
-            input_context = 'The dog'
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, bos_token_id=tokenizer.bos_token_id, eos_token_ids=tokenizer.eos_token_id, num_beams=3)  # generate sequences using greedy beam search decoding (3 beams)
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
-            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences using using greedy search
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-        """
-
-        # We cannot generate if the model does not have a LM head
-        if self.get_output_embeddings() is None:
-            raise AttributeError(
-                "You tried to generate sequences with a model that does not have a LM Head."
-                "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`)"
-            )
-
-        max_length = max_length if max_length is not None else self.config.max_length
-        do_sample = do_sample if do_sample is not None else self.config.do_sample
-        num_beams = num_beams if num_beams is not None else self.config.num_beams
-        temperature = temperature if temperature is not None else self.config.temperature
-        top_k = top_k if top_k is not None else self.config.top_k
-        top_p = top_p if top_p is not None else self.config.top_p
-        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
-        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids
-        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
-        num_return_sequences = (
-            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
-        )
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]  # overriden by the input batch_size
-        else:
-            batch_size = 1
-        if isinstance(eos_token_ids, int):
-            eos_token_ids = [eos_token_ids]
-
-        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
-        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
-        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer."
-        assert temperature > 0, "`temperature` should be strictely positive."
-        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
-        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
-        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
-        assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer."
-        assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer."
-        assert isinstance(eos_token_ids, (list, tuple)) and (
-            e >= 0 for e in eos_token_ids
-        ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
-        assert length_penalty > 0, "`length_penalty` should be strictely positive."
-        assert (
-            isinstance(num_return_sequences, int) and num_return_sequences > 0
-        ), "`num_return_sequences` should be a strictely positive integer."
-
-        if input_ids is None:
-            input_ids = torch.full(
-                (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device
-            )
-        else:
-            assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
-
-        # current position and vocab size
-        cur_len = input_ids.shape[1]
-        vocab_size = self.config.vocab_size
-
-        if num_return_sequences != 1:
-            # Expand input to num return sequences
-            input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
-            input_ids = input_ids.contiguous().view(
-                batch_size * num_return_sequences, cur_len
-            )  # (batch_size * num_return_sequences, cur_len)
-            effective_batch_size = batch_size * num_return_sequences
-        else:
-            effective_batch_size = batch_size
-
-        if num_beams > 1:
-            output = self._generate_beam_search(
-                input_ids,
-                cur_len,
-                max_length,
-                do_sample,
-                temperature,
-                top_k,
-                top_p,
-                repetition_penalty,
-                pad_token_id,
-                eos_token_ids,
-                effective_batch_size,
-                length_penalty,
-                num_beams,
-                vocab_size,
-            )
-        else:
-            output = self._generate_no_beam_search(
-                input_ids,
-                cur_len,
-                max_length,
-                do_sample,
-                temperature,
-                top_k,
-                top_p,
-                repetition_penalty,
-                pad_token_id,
-                eos_token_ids,
-                effective_batch_size,
-            )
-
-        if num_return_sequences != 1:
-            output = output.view(batch_size, num_return_sequences, -1)
-        return output
-
-    def _generate_no_beam_search(
-        self,
-        input_ids,
-        cur_len,
-        max_length,
-        do_sample,
-        temperature,
-        top_k,
-        top_p,
-        repetition_penalty,
-        pad_token_id,
-        eos_token_ids,
-        batch_size,
-    ):
-        """ Generate sequences for each example without beam search (num_beams == 1).
-            All returned sequence are generated independantly.
-        """
-        # current position / max lengths / length of generated sentences / unfinished sentences
-        unfinished_sents = input_ids.new(batch_size).fill_(1)
-
-        past = None
-
-        while cur_len < max_length:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
-            outputs = self(**model_inputs)
-            next_token_logits = outputs[0][:, -1, :]
-
-            # if model has past, then set the past variable to speed up decoding
-            if self._do_output_past(outputs):
-                past = outputs[1]
-
-            # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
-            if repetition_penalty != 1.0:
-                for i in range(batch_size):
-                    for previous_token in set(input_ids[i].tolist()):
-                        # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                        if next_token_logits[i, previous_token] < 0:
-                            next_token_logits[i, previous_token] *= repetition_penalty
-                        else:
-                            next_token_logits[i, previous_token] /= repetition_penalty
-
-            if do_sample:
-                # Temperature (higher temperature => more likely to sample low probability tokens)
-                if temperature != 1.0:
-                    next_token_logits = next_token_logits / temperature
-                # Top-p/top-k filtering
-                next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-                # Sample
-                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1).squeeze(1)
-            else:
-                # Greedy decoding
-                next_token = torch.argmax(next_token_logits, dim=-1)
-
-            # update generations and finished sentences
-            tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents)
-            input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
-            for eos_token_id in eos_token_ids:
-                unfinished_sents.mul_(tokens_to_add.ne(eos_token_id).long())
-            cur_len = cur_len + 1
-
-            # stop when there is a </s> in each sentence, or if we exceed the maximul length
-            if unfinished_sents.max() == 0:
-                break
-
-        # add eos_token_ids to unfinished sentences
-        if cur_len == max_length:
-            input_ids[:, -1].masked_fill_(unfinished_sents.to(dtype=torch.bool), eos_token_ids[0])
-
-        return input_ids
-
-    def _generate_beam_search(
-        self,
-        input_ids,
-        cur_len,
-        max_length,
-        do_sample,
-        temperature,
-        top_k,
-        top_p,
-        repetition_penalty,
-        pad_token_id,
-        eos_token_ids,
-        batch_size,
-        length_penalty,
-        num_beams,
-        vocab_size,
-    ):
-        """ Generate sequences for each example with beam search.
-        """
-        # Expand input to num beams
-        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
-        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)  # (batch_size * num_beams, cur_len)
-
-        # generated hypotheses
-        generated_hyps = [
-            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)
-        ]
-
-        # scores for each sentence in the beam
-        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
-        beam_scores[:, 1:] = -1e9
-        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)
-
-        # cache compute states
-        past = None
-
-        # done sentences
-        done = [False for _ in range(batch_size)]
-
-        while cur_len < max_length:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, past=past)
-            outputs = self(**model_inputs)  # (batch_size * num_beams, cur_len, vocab_size)
-            scores = outputs[0][:, -1, :]  # (batch_size * num_beams, vocab_size)
-
-            # if model has past, then set the past variable to speed up decoding
-            if self._do_output_past(outputs):
-                past = outputs[1]
-
-            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
-            if repetition_penalty != 1.0:
-                for i in range(batch_size * num_beams):
-                    for previous_token in set(input_ids[i].tolist()):
-                        # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                        if scores[i, previous_token] < 0:
-                            scores[i, previous_token] *= repetition_penalty
-                        else:
-                            scores[i, previous_token] /= repetition_penalty
-
-            if do_sample:
-                # Temperature (higher temperature => more likely to sample low probability tokens)
-                if temperature != 1.0:
-                    scores = scores / temperature
-                # Top-p/top-k filtering
-                scores = top_k_top_p_filtering(
-                    scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
-                )  # (batch_size * num_beams, vocab_size)
-                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
-                next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2)  # (batch_size * num_beams, 2)
-                # Compute next scores
-                _scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
-                _scores = torch.gather(_scores, -1, next_words)  # (batch_size * num_beams, 2)
-                next_scores = _scores + beam_scores[:, None].expand_as(_scores)  # (batch_size * num_beams, 2)
-                # Match shape of greedy beam search
-                next_words = next_words.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
-                next_scores = next_scores.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
-            else:
-                # do greedy beam search
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
-                assert scores.size() == (batch_size * num_beams, vocab_size)
-                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
-                # re-organize to group the beam together (we are keeping top hypothesis accross beams)
-                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
-                next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
-
-            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
-
-            # next batch beam content
-            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
-            next_batch_beam = []
-
-            # for each sentence
-            for batch_ex in range(batch_size):
-
-                # if we are done with this sentence
-                done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(next_scores[batch_ex].max().item())
-                if done[batch_ex]:
-                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
-                    continue
-
-                # next sentence beam content
-                next_sent_beam = []
-
-                # next words for this sentence
-                for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]):
-
-                    # get beam and word IDs
-                    beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
-
-                    # end of sentence, or next word
-                    if word_id.item() in eos_token_ids or cur_len + 1 == max_length:
-                        generated_hyps[batch_ex].add(
-                            input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()
-                        )
-                    else:
-                        next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id))
-
-                    # the beam for next step is full
-                    if len(next_sent_beam) == num_beams:
-                        break
-
-                # update next beam content
-                assert len(next_sent_beam) == 0 if cur_len + 1 == max_length else num_beams
-                if len(next_sent_beam) == 0:
-                    next_sent_beam = [(0, pad_token_id, 0)] * num_beams  # pad the batch
-                next_batch_beam.extend(next_sent_beam)
-                assert len(next_batch_beam) == num_beams * (batch_ex + 1)
-
-            # sanity check / prepare next batch
-            assert len(next_batch_beam) == batch_size * num_beams
-            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
-            beam_words = input_ids.new([x[1] for x in next_batch_beam])
-            beam_idx = input_ids.new([x[2] for x in next_batch_beam])
-
-            # re-order batch
-            input_ids = input_ids[beam_idx, :]
-            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)
-
-            # re-order internal states
-            if past:
-                reordered_past = []
-                for layer_past in past:
-                    # get the correct batch idx from layer past batch dim
-                    # batch dim of `past` and `mems` is at 2nd position
-                    reordered_layer_past = [layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx]
-                    reordered_layer_past = torch.cat(reordered_layer_past, dim=1)
-                    # check that shape matches
-                    assert reordered_layer_past.shape == layer_past.shape
-                    reordered_past.append(reordered_layer_past)
-                past = tuple(reordered_past)
-
-            # update current length
-            cur_len = cur_len + 1
-
-            # stop when we are done with each sentence
-            if all(done):
-                break
-
-        # visualize hypotheses
-        # print([len(x) for x in generated_hyps], cur_len)
-        # globals().update( locals() );
-        # !import code; code.interact(local=vars())
-        # for ii in range(batch_size):
-        #     for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True):
-        #         print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist()))
-        #     print("")
-
-        # select the best hypotheses
-        tgt_len = input_ids.new(batch_size)
-        best = []
-
-        for i, hypotheses in enumerate(generated_hyps):
-            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
-            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
-            best.append(best_hyp)
-
-        # generate target batch
-        decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id)
-        for i, hypo in enumerate(best):
-            decoded[i, : tgt_len[i] - 1] = hypo
-            decoded[i, tgt_len[i] - 1] = eos_token_ids[0]
-
-        return decoded
-
-
-def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
-    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-        Args:
-            logits: logits distribution shape (batch size, vocabulary size)
-            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
-            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-            Make sure we keep at least min_tokens_to_keep per batch example in the output
-        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
-    """
-    if top_k > 0:
-        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
-        logits[indices_to_remove] = filter_value
-
-    if top_p < 1.0:
-        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs > top_p
-        if min_tokens_to_keep > 1:
-            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
-            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
-        # Shift the indices to the right to keep also the first token above the threshold
-        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-        sorted_indices_to_remove[..., 0] = 0
-
-        # scatter sorted tensors to original indexing
-        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-        logits[indices_to_remove] = filter_value
-    return logits
-
-
-class BeamHypotheses(object):
-    def __init__(self, n_hyp, max_length, length_penalty, early_stopping):
-        """
-        Initialize n-best list of hypotheses.
-        """
-        self.max_length = max_length - 1  # ignoring bos_token
-        self.length_penalty = length_penalty
-        self.early_stopping = early_stopping
-        self.n_hyp = n_hyp
-        self.hyp = []
-        self.worst_score = 1e9
-
-    def __len__(self):
-        """
-        Number of hypotheses in the list.
-        """
-        return len(self.hyp)
-
-    def add(self, hyp, sum_logprobs):
-        """
-        Add a new hypothesis to the list.
-        """
-        score = sum_logprobs / len(hyp) ** self.length_penalty
-        if len(self) < self.n_hyp or score > self.worst_score:
-            self.hyp.append((score, hyp))
-            if len(self) > self.n_hyp:
-                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
-                del self.hyp[sorted_scores[0][1]]
-                self.worst_score = sorted_scores[1][0]
-            else:
-                self.worst_score = min(score, self.worst_score)
-
-    def is_done(self, best_sum_logprobs):
-        """
-        If there are enough hypotheses and that none of the hypotheses being generated
-        can become better than the worst one in the heap, then we are done with this sentence.
-        """
-        if len(self) < self.n_hyp:
-            return False
-        elif self.early_stopping:
-            return True
-        else:
-            return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty
-
-
-class Conv1D(nn.Module):
-    def __init__(self, nf, nx):
-        """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
-            Basically works like a Linear layer but the weights are transposed
-        """
-        super().__init__()
-        self.nf = nf
-        w = torch.empty(nx, nf)
-        nn.init.normal_(w, std=0.02)
-        self.weight = nn.Parameter(w)
-        self.bias = nn.Parameter(torch.zeros(nf))
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(*size_out)
-        return x
-
-
-class PoolerStartLogits(nn.Module):
-    """ Compute SQuAD start_logits from sequence hidden states. """
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, 1)
-
-    def forward(self, hidden_states, p_mask=None):
-        """ Args:
-            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)`
-                invalid position mask such as query and special symbols (PAD, SEP, CLS)
-                1.0 means token should be masked.
-        """
-        x = self.dense(hidden_states).squeeze(-1)
-
-        if p_mask is not None:
-            if next(self.parameters()).dtype == torch.float16:
-                x = x * (1 - p_mask) - 65500 * p_mask
-            else:
-                x = x * (1 - p_mask) - 1e30 * p_mask
-
-        return x
-
-
-class PoolerEndLogits(nn.Module):
-    """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
-        self.activation = nn.Tanh()
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dense_1 = nn.Linear(config.hidden_size, 1)
-
-    def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
-        """ Args:
-            One of ``start_states``, ``start_positions`` should be not None.
-            If both are set, ``start_positions`` overrides ``start_states``.
-
-            **start_states**: ``torch.LongTensor`` of shape identical to hidden_states
-                hidden states of the first tokens for the labeled span.
-            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
-                position of the first token for the labeled span:
-            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
-                Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
-                1.0 means token should be masked.
-        """
-        assert (
-            start_states is not None or start_positions is not None
-        ), "One of start_states, start_positions should be not None"
-        if start_positions is not None:
-            slen, hsz = hidden_states.shape[-2:]
-            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
-            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
-            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)
-
-        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
-        x = self.activation(x)
-        x = self.LayerNorm(x)
-        x = self.dense_1(x).squeeze(-1)
-
-        if p_mask is not None:
-            if next(self.parameters()).dtype == torch.float16:
-                x = x * (1 - p_mask) - 65500 * p_mask
-            else:
-                x = x * (1 - p_mask) - 1e30 * p_mask
-
-        return x
-
-
-class PoolerAnswerClass(nn.Module):
-    """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
-        self.activation = nn.Tanh()
-        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
-
-    def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
-        """
-        Args:
-            One of ``start_states``, ``start_positions`` should be not None.
-            If both are set, ``start_positions`` overrides ``start_states``.
-
-            **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``.
-                hidden states of the first tokens for the labeled span.
-            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
-                position of the first token for the labeled span.
-            **cls_index**: torch.LongTensor of shape ``(batch_size,)``
-                position of the CLS token. If None, take the last token.
-
-            note(Original repo):
-                no dependency on end_feature so that we can obtain one single `cls_logits`
-                for each sample
-        """
-        hsz = hidden_states.shape[-1]
-        assert (
-            start_states is not None or start_positions is not None
-        ), "One of start_states, start_positions should be not None"
-        if start_positions is not None:
-            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
-            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)
-
-        if cls_index is not None:
-            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
-            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
-        else:
-            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
-
-        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
-        x = self.activation(x)
-        x = self.dense_1(x).squeeze(-1)
-
-        return x
-
-
-class SQuADHead(nn.Module):
-    r""" A SQuAD head inspired by XLNet.
-
-    Parameters:
-        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
-
-    Inputs:
-        **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
-            hidden states of sequence tokens
-        **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
-            position of the first token for the labeled span.
-        **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
-            position of the last token for the labeled span.
-        **cls_index**: torch.LongTensor of shape ``(batch_size,)``
-            position of the CLS token. If None, take the last token.
-        **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)``
-            Whether the question has a possible answer in the paragraph or not.
-        **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
-            Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
-            1.0 means token should be masked.
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
-            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
-            Indices for the top config.start_n_top start token possibilities (beam-search).
-        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
-            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
-            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
-            ``torch.FloatTensor`` of shape ``(batch_size,)``
-            Log probabilities for the ``is_impossible`` label of the answers.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.start_n_top = config.start_n_top
-        self.end_n_top = config.end_n_top
-
-        self.start_logits = PoolerStartLogits(config)
-        self.end_logits = PoolerEndLogits(config)
-        self.answer_class = PoolerAnswerClass(config)
-
-    def forward(
-        self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None
-    ):
-        outputs = ()
-
-        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
-
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, let's remove the dimension added by batch splitting
-            for x in (start_positions, end_positions, cls_index, is_impossible):
-                if x is not None and x.dim() > 1:
-                    x.squeeze_(-1)
-
-            # during training, compute the end logits based on the ground truth of the start position
-            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
-
-            loss_fct = CrossEntropyLoss()
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-
-            if cls_index is not None and is_impossible is not None:
-                # Predict answerability from the representation of CLS and START
-                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
-                loss_fct_cls = nn.BCEWithLogitsLoss()
-                cls_loss = loss_fct_cls(cls_logits, is_impossible)
-
-                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
-                total_loss += cls_loss * 0.5
-
-            outputs = (total_loss,) + outputs
-
-        else:
-            # during inference, compute the end logits based on beam search
-            bsz, slen, hsz = hidden_states.size()
-            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
-
-            start_top_log_probs, start_top_index = torch.topk(
-                start_log_probs, self.start_n_top, dim=-1
-            )  # shape (bsz, start_n_top)
-            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
-            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
-
-            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
-                start_states
-            )  # shape (bsz, slen, start_n_top, hsz)
-            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
-            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
-
-            end_top_log_probs, end_top_index = torch.topk(
-                end_log_probs, self.end_n_top, dim=1
-            )  # shape (bsz, end_n_top, start_n_top)
-            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
-            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
-
-            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
-            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
-
-            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
-
-        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
-        # or (if labels are provided) (total_loss,)
-        return outputs
-
-
-class SequenceSummary(nn.Module):
-    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
-        Args of the config class:
-            summary_type:
-                - 'last' => [default] take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj: Add a projection after the vector extraction
-            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
-            summary_first_dropout: Add a dropout before the projection and activation
-            summary_last_dropout: Add a dropout after the projection and activation
-    """
-
-    def __init__(self, config):
-        super().__init__()
-
-        self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last"
-        if self.summary_type == "attn":
-            # We should use a standard multi-head attention module with absolute positional embedding for that.
-            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
-            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
-            raise NotImplementedError
-
-        self.summary = Identity()
-        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
-            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
-                num_classes = config.num_labels
-            else:
-                num_classes = config.hidden_size
-            self.summary = nn.Linear(config.hidden_size, num_classes)
-
-        self.activation = Identity()
-        if hasattr(config, "summary_activation") and config.summary_activation == "tanh":
-            self.activation = nn.Tanh()
-
-        self.first_dropout = Identity()
-        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
-            self.first_dropout = nn.Dropout(config.summary_first_dropout)
-
-        self.last_dropout = Identity()
-        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
-            self.last_dropout = nn.Dropout(config.summary_last_dropout)
-
-    def forward(self, hidden_states, cls_index=None):
-        """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
-            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
-                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
-                if summary_type == 'cls_index' and cls_index is None:
-                    we take the last token of the sequence as classification token
-        """
-        if self.summary_type == "last":
-            output = hidden_states[:, -1]
-        elif self.summary_type == "first":
-            output = hidden_states[:, 0]
-        elif self.summary_type == "mean":
-            output = hidden_states.mean(dim=1)
-        elif self.summary_type == "cls_index":
-            if cls_index is None:
-                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long)
-            else:
-                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
-                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
-            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
-            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
-        elif self.summary_type == "attn":
-            raise NotImplementedError
-
-        output = self.first_dropout(output)
-        output = self.summary(output)
-        output = self.activation(output)
-        output = self.last_dropout(output)
-
-        return output
-
-
-def prune_linear_layer(layer, index, dim=0):
-    """ Prune a linear layer (a model parameters) to keep only entries in index.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    index = index.to(layer.weight.device)
-    W = layer.weight.index_select(dim, index).clone().detach()
-    if layer.bias is not None:
-        if dim == 1:
-            b = layer.bias.clone().detach()
-        else:
-            b = layer.bias[index].clone().detach()
-    new_size = list(layer.weight.size())
-    new_size[dim] = len(index)
-    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
-    new_layer.weight.requires_grad = False
-    new_layer.weight.copy_(W.contiguous())
-    new_layer.weight.requires_grad = True
-    if layer.bias is not None:
-        new_layer.bias.requires_grad = False
-        new_layer.bias.copy_(b.contiguous())
-        new_layer.bias.requires_grad = True
-    return new_layer
-
-
-def prune_conv1d_layer(layer, index, dim=1):
-    """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
-        A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    index = index.to(layer.weight.device)
-    W = layer.weight.index_select(dim, index).clone().detach()
-    if dim == 0:
-        b = layer.bias.clone().detach()
-    else:
-        b = layer.bias[index].clone().detach()
-    new_size = list(layer.weight.size())
-    new_size[dim] = len(index)
-    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
-    new_layer.weight.requires_grad = False
-    new_layer.weight.copy_(W.contiguous())
-    new_layer.weight.requires_grad = True
-    new_layer.bias.requires_grad = False
-    new_layer.bias.copy_(b.contiguous())
-    new_layer.bias.requires_grad = True
-    return new_layer
-
-
-def prune_layer(layer, index, dim=None):
-    """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    if isinstance(layer, nn.Linear):
-        return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
-    elif isinstance(layer, Conv1D):
-        return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
-    else:
-        raise ValueError("Can't prune layer of class {}".format(layer.__class__))
-
-def transpose_iterable(ls):
-    """Transpose a list of lists (or tuple of identically lengthed tuples)"""
-    tp = type(ls)
-    if len(ls) > 0: assert type(ls[0]) == tp, f"Expected type {tp}, instead got type {type(ls[0])} inside outer list"
-
-    return tp(map(tp, zip_longest(*ls)))
\ No newline at end of file
diff --git a/server/transformers/src/transformers/modeling_xlm.py b/server/transformers/src/transformers/modeling_xlm.py
deleted file mode 100644
index 9ba5540f9c8ea98af1d248e2708ebfd12094d576..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_xlm.py
+++ /dev/null
@@ -1,1052 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch XLM model.
-"""
-
-
-import itertools
-import logging
-import math
-
-import numpy as np
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss
-from torch.nn import functional as F
-
-from .configuration_xlm import XLMConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer
-
-
-logger = logging.getLogger(__name__)
-
-XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
-    "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin",
-    "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin",
-    "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
-    "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
-    "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
-    "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
-    "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
-    "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
-    "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
-}
-
-
-def create_sinusoidal_embeddings(n_pos, dim, out):
-    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
-    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
-    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
-    out.detach_()
-    out.requires_grad = False
-
-
-def gelu(x):
-    """
-    GELU activation
-    https://arxiv.org/abs/1606.08415
-    https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/transformers/blob/master/modeling.py
-    """
-    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
-def get_masks(slen, lengths, causal, padding_mask=None):
-    """
-    Generate hidden states mask, and optionally an attention mask.
-    """
-    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
-    if padding_mask is not None:
-        mask = padding_mask
-    else:
-        assert lengths.max().item() <= slen
-        mask = alen < lengths[:, None]
-
-    # attention mask is the same as mask, or triangular inferior attention (causal)
-    bs = lengths.size(0)
-    if causal:
-        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
-    else:
-        attn_mask = mask
-
-    # sanity check
-    assert mask.size() == (bs, slen)
-    assert causal is False or attn_mask.size() == (bs, slen, slen)
-
-    return mask, attn_mask
-
-
-class MultiHeadAttention(nn.Module):
-
-    NEW_ID = itertools.count()
-
-    def __init__(self, n_heads, dim, config):
-        super().__init__()
-        self.layer_id = next(MultiHeadAttention.NEW_ID)
-        self.output_attentions = config.output_attentions
-        self.dim = dim
-        self.n_heads = n_heads
-        self.dropout = config.attention_dropout
-        assert self.dim % self.n_heads == 0
-
-        self.q_lin = nn.Linear(dim, dim)
-        self.k_lin = nn.Linear(dim, dim)
-        self.v_lin = nn.Linear(dim, dim)
-        self.out_lin = nn.Linear(dim, dim)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        attention_head_size = self.dim // self.n_heads
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_heads, attention_head_size)
-        heads = set(heads) - self.pruned_heads
-        for head in heads:
-            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        # Prune linear layers
-        self.q_lin = prune_linear_layer(self.q_lin, index)
-        self.k_lin = prune_linear_layer(self.k_lin, index)
-        self.v_lin = prune_linear_layer(self.v_lin, index)
-        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
-        # Update hyper params
-        self.n_heads = self.n_heads - len(heads)
-        self.dim = attention_head_size * self.n_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def forward(self, input, mask, kv=None, cache=None, head_mask=None):
-        """
-        Self-attention (if kv is None) or attention over source sentence (provided by kv).
-        """
-        # Input is (bs, qlen, dim)
-        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
-        bs, qlen, dim = input.size()
-        if kv is None:
-            klen = qlen if cache is None else cache["slen"] + qlen
-        else:
-            klen = kv.size(1)
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        n_heads = self.n_heads
-        dim_per_head = self.dim // n_heads
-        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
-
-        def shape(x):
-            """  projection """
-            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
-
-        def unshape(x):
-            """  compute context """
-            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
-
-        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-        if kv is None:
-            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
-        elif cache is None or self.layer_id not in cache:
-            k = v = kv
-            k = shape(self.k_lin(k))  # (bs, n_heads, qlen, dim_per_head)
-            v = shape(self.v_lin(v))  # (bs, n_heads, qlen, dim_per_head)
-
-        if cache is not None:
-            if self.layer_id in cache:
-                if kv is None:
-                    k_, v_ = cache[self.layer_id]
-                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
-                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
-                else:
-                    k, v = cache[self.layer_id]
-            cache[self.layer_id] = (k, v)
-
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
-        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
-        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
-        scores.masked_fill_(mask, -float("inf"))  # (bs, n_heads, qlen, klen)
-
-        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
-        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            weights = weights * head_mask
-
-        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
-        context = unshape(context)  # (bs, qlen, dim)
-
-        outputs = (self.out_lin(context),)
-        if self.output_attentions:
-            outputs = outputs + (weights,)
-        return outputs
-
-
-class TransformerFFN(nn.Module):
-    def __init__(self, in_dim, dim_hidden, out_dim, config):
-        super().__init__()
-        self.dropout = config.dropout
-        self.lin1 = nn.Linear(in_dim, dim_hidden)
-        self.lin2 = nn.Linear(dim_hidden, out_dim)
-        self.act = gelu if config.gelu_activation else F.relu
-
-    def forward(self, input):
-        x = self.lin1(input)
-        x = self.act(x)
-        x = self.lin2(x)
-        x = F.dropout(x, p=self.dropout, training=self.training)
-        return x
-
-
-class XLMPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = XLMConfig
-    pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = None
-    base_model_prefix = "transformer"
-
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
-    @property
-    def dummy_inputs(self):
-        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-        if self.config.use_lang_emb and self.config.n_langs > 1:
-            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-        else:
-            langs_list = None
-        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
-
-    def _init_weights(self, module):
-        """ Initialize the weights. """
-        if isinstance(module, nn.Embedding):
-            if self.config is not None and self.config.embed_init_std is not None:
-                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
-        if isinstance(module, nn.Linear):
-            if self.config is not None and self.config.init_std is not None:
-                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
-                if hasattr(module, "bias") and module.bias is not None:
-                    nn.init.constant_(module.bias, 0.0)
-        if isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-
-XLM_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XLM_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        langs (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            A parallel sequence of tokens to be used to indicate the language of each token in the input.
-            Indices are languages ids which can be obtained from the language names by using two conversion mappings
-            provided in the configuration of the model (only provided for multilingual models).
-            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
-            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
-
-            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Length of each sentence that can be used to avoid performing attention on padding token indices.
-            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
-            Indices selected in ``[0, ..., input_ids.size(-1)]``:
-        cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`):
-            dictionary with ``torch.FloatTensor`` that contains pre-computed
-            hidden-states (key and values in the attention blocks) as computed by the model
-            (see `cache` output below). Can be used to speed up sequential decoding.
-            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
-    XLM_START_DOCSTRING,
-)
-class XLMModel(XLMPreTrainedModel):
-    def __init__(self, config):  # , dico, is_encoder, with_output):
-        super().__init__(config)
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-
-        # encoder / decoder, output layer
-        self.is_encoder = config.is_encoder
-        self.is_decoder = not config.is_encoder
-        if self.is_decoder:
-            raise NotImplementedError("Currently XLM can only be used as an encoder")
-        # self.with_output = with_output
-        self.causal = config.causal
-
-        # dictionary / languages
-        self.n_langs = config.n_langs
-        self.use_lang_emb = config.use_lang_emb
-        self.n_words = config.n_words
-        self.eos_index = config.eos_index
-        self.pad_index = config.pad_index
-        # self.dico = dico
-        # self.id2lang = config.id2lang
-        # self.lang2id = config.lang2id
-        # assert len(self.dico) == self.n_words
-        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
-
-        # model parameters
-        self.dim = config.emb_dim  # 512 by default
-        self.hidden_dim = self.dim * 4  # 2048 by default
-        self.n_heads = config.n_heads  # 8 by default
-        self.n_layers = config.n_layers
-        self.dropout = config.dropout
-        self.attention_dropout = config.attention_dropout
-        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
-
-        # embeddings
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
-        if config.sinusoidal_embeddings:
-            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
-        if config.n_langs > 1 and config.use_lang_emb:
-            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
-        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
-        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
-
-        # transformer layers
-        self.attentions = nn.ModuleList()
-        self.layer_norm1 = nn.ModuleList()
-        self.ffns = nn.ModuleList()
-        self.layer_norm2 = nn.ModuleList()
-        # if self.is_decoder:
-        #     self.layer_norm15 = nn.ModuleList()
-        #     self.encoder_attn = nn.ModuleList()
-
-        for _ in range(self.n_layers):
-            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
-            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
-            # if self.is_decoder:
-            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
-            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
-            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
-            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
-
-        if hasattr(config, "pruned_heads"):
-            pruned_heads = config.pruned_heads.copy().items()
-            config.pruned_heads = {}
-            for layer, heads in pruned_heads:
-                if self.attentions[int(layer)].n_heads == config.n_heads:
-                    self.prune_heads({int(layer): list(map(int, heads))})
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def set_input_embeddings(self, new_embeddings):
-        self.embeddings = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.attentions[layer].prune_heads(heads)
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        if input_ids is not None:
-            bs, slen = input_ids.size()
-        else:
-            bs, slen = inputs_embeds.size()[:-1]
-
-        if lengths is None:
-            if input_ids is not None:
-                lengths = (input_ids != self.pad_index).sum(dim=1).long()
-            else:
-                lengths = torch.LongTensor([slen] * bs)
-        # mask = input_ids != self.pad_index
-
-        # check inputs
-        assert lengths.size(0) == bs
-        assert lengths.max().item() <= slen
-        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
-        # assert (src_enc is None) == (src_len is None)
-        # if src_enc is not None:
-        #     assert self.is_decoder
-        #     assert src_enc.size(0) == bs
-
-        # generate masks
-        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
-        # if self.is_decoder and src_enc is not None:
-        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        # position_ids
-        if position_ids is None:
-            position_ids = torch.arange(slen, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).expand((bs, slen))
-        else:
-            assert position_ids.size() == (bs, slen)  # (slen, bs)
-            # position_ids = position_ids.transpose(0, 1)
-
-        # langs
-        if langs is not None:
-            assert langs.size() == (bs, slen)  # (slen, bs)
-            # langs = langs.transpose(0, 1)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.n_layers
-
-        # do not recompute cached elements
-        if cache is not None and input_ids is not None:
-            _slen = slen - cache["slen"]
-            input_ids = input_ids[:, -_slen:]
-            position_ids = position_ids[:, -_slen:]
-            if langs is not None:
-                langs = langs[:, -_slen:]
-            mask = mask[:, -_slen:]
-            attn_mask = attn_mask[:, -_slen:]
-
-        # embeddings
-        if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)
-
-        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
-        if langs is not None and self.use_lang_emb:
-            tensor = tensor + self.lang_embeddings(langs)
-        if token_type_ids is not None:
-            tensor = tensor + self.embeddings(token_type_ids)
-        tensor = self.layer_norm_emb(tensor)
-        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
-        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
-
-        # transformer layers
-        hidden_states = ()
-        attentions = ()
-        for i in range(self.n_layers):
-            if self.output_hidden_states:
-                hidden_states = hidden_states + (tensor,)
-
-            # self attention
-            attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
-            attn = attn_outputs[0]
-            if self.output_attentions:
-                attentions = attentions + (attn_outputs[1],)
-            attn = F.dropout(attn, p=self.dropout, training=self.training)
-            tensor = tensor + attn
-            tensor = self.layer_norm1[i](tensor)
-
-            # encoder attention (for decoder only)
-            # if self.is_decoder and src_enc is not None:
-            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
-            #     tensor = tensor + attn
-            #     tensor = self.layer_norm15[i](tensor)
-
-            # FFN
-            tensor = tensor + self.ffns[i](tensor)
-            tensor = self.layer_norm2[i](tensor)
-            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
-
-        # Add last hidden state
-        if self.output_hidden_states:
-            hidden_states = hidden_states + (tensor,)
-
-        # update cache length
-        if cache is not None:
-            cache["slen"] += tensor.size(1)
-
-        # move back sequence length to dimension 0
-        # tensor = tensor.transpose(0, 1)
-
-        outputs = (tensor,)
-        if self.output_hidden_states:
-            outputs = outputs + (hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (attentions,)
-        return outputs  # outputs, (hidden_states), (attentions)
-
-
-class XLMPredLayer(nn.Module):
-    """
-    Prediction layer (cross_entropy or adaptive_softmax).
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.asm = config.asm
-        self.n_words = config.n_words
-        self.pad_index = config.pad_index
-        dim = config.emb_dim
-
-        if config.asm is False:
-            self.proj = nn.Linear(dim, config.n_words, bias=True)
-        else:
-            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
-                in_features=dim,
-                n_classes=config.n_words,
-                cutoffs=config.asm_cutoffs,
-                div_value=config.asm_div_value,
-                head_bias=True,  # default is False
-            )
-
-    def forward(self, x, y=None):
-        """ Compute the loss, and optionally the scores.
-        """
-        outputs = ()
-        if self.asm is False:
-            scores = self.proj(x)
-            outputs = (scores,) + outputs
-            if y is not None:
-                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean")
-                outputs = (loss,) + outputs
-        else:
-            scores = self.proj.log_prob(x)
-            outputs = (scores,) + outputs
-            if y is not None:
-                _, loss = self.proj(x, y)
-                outputs = (loss,) + outputs
-
-        return outputs
-
-
-@add_start_docstrings(
-    """The XLM Model transformer with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    XLM_START_DOCSTRING,
-)
-class XLMWithLMHeadModel(XLMPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.transformer = XLMModel(config)
-        self.pred_layer = XLMPredLayer(config)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.pred_layer.proj
-
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        mask_token_id = self.config.mask_token_id
-        lang_id = self.config.lang_id
-
-        effective_batch_size = input_ids.shape[0]
-        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
-        input_ids = torch.cat([input_ids, mask_token], dim=1)
-        if lang_id is not None:
-            langs = torch.full_like(input_ids, lang_id)
-        else:
-            langs = None
-        return {"input_ids": input_ids, "langs": langs}
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMWithLMHeadModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            langs=langs,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            lengths=lengths,
-            cache=cache,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        output = transformer_outputs[0]
-        outputs = self.pred_layer(output, labels)
-        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        return outputs
-
-
-@add_start_docstrings(
-    """XLM Model with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XLM_START_DOCSTRING,
-)
-class XLMForSequenceClassification(XLMPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XLMModel(config)
-        self.sequence_summary = SequenceSummary(config)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
-            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForSequenceClassification
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            langs=langs,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            lengths=lengths,
-            cache=cache,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        output = transformer_outputs[0]
-        logits = self.sequence_summary(output)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs
-
-
-@add_start_docstrings(
-    """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING,
-)
-class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.transformer = XLMModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            langs=langs,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            lengths=lengths,
-            cache=cache,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = transformer_outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (
-            start_logits,
-            end_logits,
-        )
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        return outputs
-
-
-@add_start_docstrings(
-    """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLM_START_DOCSTRING,
-)
-class XLMForQuestionAnswering(XLMPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.transformer = XLMModel(config)
-        self.qa_outputs = SQuADHead(config)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        langs=None,
-        token_type_ids=None,
-        position_ids=None,
-        lengths=None,
-        cache=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-        is_impossible=None,
-        cls_index=None,
-        p_mask=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels whether a question has an answer or no answer (SQuAD 2.0)
-        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
-        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
-            1.0 means token should be masked. 0.0 mean token is not masked.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
-            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Indices for the top config.start_n_top start token possibilities (beam-search).
-        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the ``is_impossible`` label of the answers.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForQuestionAnswering
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            langs=langs,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            lengths=lengths,
-            cache=cache,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        output = transformer_outputs[0]
-
-        outputs = self.qa_outputs(
-            output,
-            start_positions=start_positions,
-            end_positions=end_positions,
-            cls_index=cls_index,
-            is_impossible=is_impossible,
-            p_mask=p_mask,
-        )
-
-        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        return outputs
diff --git a/server/transformers/src/transformers/modeling_xlm_roberta.py b/server/transformers/src/transformers/modeling_xlm_roberta.py
deleted file mode 100644
index c00a2eb4f5dc283315bd32fe1913f1c84405d089..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_xlm_roberta.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# coding=utf-8
-# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch XLM-RoBERTa model. """
-
-
-import logging
-
-from .configuration_xlm_roberta import XLMRobertaConfig
-from .file_utils import add_start_docstrings
-from .modeling_roberta import (
-    RobertaForMaskedLM,
-    RobertaForMultipleChoice,
-    RobertaForSequenceClassification,
-    RobertaForTokenClassification,
-    RobertaModel,
-)
-
-
-logger = logging.getLogger(__name__)
-
-XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
-    "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
-    "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
-    "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
-    "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
-    "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
-}
-
-
-XLM_ROBERTA_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
-            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class XLMRobertaModel(RobertaModel):
-    """
-    This class overrides :class:`~transformers.RobertaModel`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
-)
-class XLMRobertaForMaskedLM(RobertaForMaskedLM):
-    """
-    This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
-    on top of the pooled output) e.g. for GLUE tasks. """,
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
-    """
-    This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
-    """
-    This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@add_start_docstrings(
-    """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLM_ROBERTA_START_DOCSTRING,
-)
-class XLMRobertaForTokenClassification(RobertaForTokenClassification):
-    """
-    This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
-    superclass for the appropriate documentation alongside usage examples.
-    """
-
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/server/transformers/src/transformers/modeling_xlnet.py b/server/transformers/src/transformers/modeling_xlnet.py
deleted file mode 100644
index 2720c848914faace9b4700ca8d47a83e4451c8f9..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/modeling_xlnet.py
+++ /dev/null
@@ -1,1682 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch XLNet model.
-"""
-
-
-import logging
-import math
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss
-from torch.nn import functional as F
-
-from .configuration_xlnet import XLNetConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
-from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
-
-
-logger = logging.getLogger(__name__)
-
-XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
-    "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
-}
-
-
-def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
-    """ A map of modules from TF to PyTorch.
-        I use a map to keep the PyTorch model as
-        identical to the original PyTorch model as possible.
-    """
-
-    tf_to_pt_map = {}
-
-    if hasattr(model, "transformer"):
-        if hasattr(model, "lm_loss"):
-            # We will load also the output bias
-            tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias
-        if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights:
-            # We will load also the sequence summary
-            tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight
-            tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias
-        if (
-            hasattr(model, "logits_proj")
-            and config.finetuning_task is not None
-            and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights
-        ):
-            tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight
-            tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias
-
-        # Now load the rest of the transformer
-        model = model.transformer
-
-    # Embeddings and output
-    tf_to_pt_map.update(
-        {
-            "model/transformer/word_embedding/lookup_table": model.word_embedding.weight,
-            "model/transformer/mask_emb/mask_emb": model.mask_emb,
-        }
-    )
-
-    # Transformer blocks
-    for i, b in enumerate(model.layer):
-        layer_str = "model/transformer/layer_%d/" % i
-        tf_to_pt_map.update(
-            {
-                layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
-                layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
-                layer_str + "rel_attn/o/kernel": b.rel_attn.o,
-                layer_str + "rel_attn/q/kernel": b.rel_attn.q,
-                layer_str + "rel_attn/k/kernel": b.rel_attn.k,
-                layer_str + "rel_attn/r/kernel": b.rel_attn.r,
-                layer_str + "rel_attn/v/kernel": b.rel_attn.v,
-                layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
-                layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
-                layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
-                layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
-                layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
-                layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
-            }
-        )
-
-    # Relative positioning biases
-    if config.untie_r:
-        r_r_list = []
-        r_w_list = []
-        r_s_list = []
-        seg_embed_list = []
-        for b in model.layer:
-            r_r_list.append(b.rel_attn.r_r_bias)
-            r_w_list.append(b.rel_attn.r_w_bias)
-            r_s_list.append(b.rel_attn.r_s_bias)
-            seg_embed_list.append(b.rel_attn.seg_embed)
-    else:
-        r_r_list = [model.r_r_bias]
-        r_w_list = [model.r_w_bias]
-        r_s_list = [model.r_s_bias]
-        seg_embed_list = [model.seg_embed]
-    tf_to_pt_map.update(
-        {
-            "model/transformer/r_r_bias": r_r_list,
-            "model/transformer/r_w_bias": r_w_list,
-            "model/transformer/r_s_bias": r_s_list,
-            "model/transformer/seg_embed": seg_embed_list,
-        }
-    )
-    return tf_to_pt_map
-
-
-def load_tf_weights_in_xlnet(model, config, tf_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    try:
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    tf_weights = {}
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        tf_weights[name] = array
-
-    # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
-
-    for name, pointer in tf_to_pt_map.items():
-        logger.info("Importing {}".format(name))
-        if name not in tf_weights:
-            logger.info("{} not in tf pre-trained weights, skipping".format(name))
-            continue
-        array = tf_weights[name]
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name):
-            logger.info("Transposing")
-            array = np.transpose(array)
-        if isinstance(pointer, list):
-            # Here we will split the TF weigths
-            assert len(pointer) == array.shape[0]
-            for i, p_i in enumerate(pointer):
-                arr_i = array[i, ...]
-                try:
-                    assert p_i.shape == arr_i.shape
-                except AssertionError as e:
-                    e.args += (p_i.shape, arr_i.shape)
-                    raise
-                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
-                p_i.data = torch.from_numpy(arr_i)
-        else:
-            try:
-                assert pointer.shape == array.shape
-            except AssertionError as e:
-                e.args += (pointer.shape, array.shape)
-                raise
-            logger.info("Initialize PyTorch weight {}".format(name))
-            pointer.data = torch.from_numpy(array)
-        tf_weights.pop(name, None)
-        tf_weights.pop(name + "/Adam", None)
-        tf_weights.pop(name + "/Adam_1", None)
-
-    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
-    return model
-
-
-def gelu(x):
-    """ Implementation of the gelu activation function.
-        XLNet is using OpenAI GPT's gelu (not exactly the same as BERT)
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    return x * cdf
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
-
-
-XLNetLayerNorm = nn.LayerNorm
-
-
-class XLNetRelativeAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.output_attentions = config.output_attentions
-
-        if config.d_model % config.n_head != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.n_head)
-            )
-
-        self.n_head = config.n_head
-        self.d_head = config.d_head
-        self.d_model = config.d_model
-        self.scale = 1 / (config.d_head ** 0.5)
-
-        self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
-        self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
-        self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
-        self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
-        self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
-
-        self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-        self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-        self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
-        self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head))
-
-        self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def prune_heads(self, heads):
-        raise NotImplementedError
-
-    @staticmethod
-    def rel_shift(x, klen=-1):
-        """perform relative shift to form the relative attention score."""
-        x_size = x.shape
-
-        x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
-        x = x[1:, ...]
-        x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
-        # x = x[:, 0:klen, :, :]
-        x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
-
-        return x
-
-    @staticmethod
-    def rel_shift_bnij(x, klen=-1):
-        x_size = x.shape
-
-        x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2])
-        x = x[:, :, 1:, :]
-        x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1)
-        # Note: the tensor-slice form was faster in my testing than torch.index_select
-        #       However, tracing doesn't like the nature of the slice, and if klen changes
-        #       during the run then it'll fail, whereas index_select will be fine.
-        x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long))
-        # x = x[:, :, :, :klen]
-
-        return x
-
-    def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None):
-        """Core relative positional attention operations."""
-
-        # content based attention score
-        ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h)
-
-        # position based attention score
-        bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift_bnij(bd, klen=ac.shape[3])
-
-        # segment based attention score
-        if seg_mat is None:
-            ef = 0
-        else:
-            ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed)
-            ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef)
-
-        # merge attention scores and perform masking
-        attn_score = (ac + bd + ef) * self.scale
-        if attn_mask is not None:
-            # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
-            if attn_mask.dtype == torch.float16:
-                attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask)
-            else:
-                attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask)
-
-        # attention probability
-        attn_prob = F.softmax(attn_score, dim=3)
-        attn_prob = self.dropout(attn_prob)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask)
-
-        # attention output
-        attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h)
-
-        if self.output_attentions:
-            return attn_vec, torch.einsum("bnij->ijbn", attn_prob)
-
-        return attn_vec
-
-    def post_attention(self, h, attn_vec, residual=True):
-        """Post-attention processing."""
-        # post-attention projection (back to `d_model`)
-        attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o)
-
-        attn_out = self.dropout(attn_out)
-        if residual:
-            attn_out = attn_out + h
-        output = self.layer_norm(attn_out)
-
-        return output
-
-    def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None):
-        """Run relative-position attention for the content stream `h` and, when given, the query stream `g`.
-
-        Returns `(output_h, output_g)`, where `output_g` is None when `g` is None. When
-        `self.output_attentions` is set, the attention probabilities are appended: a single
-        value in the one-stream case, an `(attn_prob_h, attn_prob_g)` pair in the two-stream case.
-        """
-        # Keys and values also cover the cached memory when one is supplied.
-        if mems is not None and mems.dim() > 1:
-            cat = torch.cat([mems, h], dim=0)
-        else:
-            cat = h
-
-        # Projections needed by both streams: content query/key/value heads and the positional key head.
-        q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q)
-        k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k)
-        v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v)
-        k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r)
-
-        # Content stream (h-stream): core attention followed by post-processing.
-        attn_vec_h = self.rel_attn_core(
-            q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask
-        )
-        if self.output_attentions:
-            attn_vec_h, attn_prob_h = attn_vec_h
-        output_h = self.post_attention(h, attn_vec_h)
-
-        if g is None:
-            # Plain multi-head attention with relative positional encoding: no query stream.
-            output_g = None
-            if self.output_attentions:
-                attn_prob = attn_prob_h
-        else:
-            # Query stream (g-stream) of two-stream attention; it reuses the content keys and values.
-            q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q)
-            use_mapping = target_mapping is not None
-            if use_mapping:
-                q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
-
-            attn_vec_g = self.rel_attn_core(
-                q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask
-            )
-            if self.output_attentions:
-                attn_vec_g, attn_prob_g = attn_vec_g
-
-            if use_mapping:
-                # Map the attention output back through `target_mapping`.
-                attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
-
-            output_g = self.post_attention(g, attn_vec_g)
-            if self.output_attentions:
-                attn_prob = attn_prob_h, attn_prob_g
-
-        outputs = (output_h, output_g)
-        if self.output_attentions:
-            outputs = outputs + (attn_prob,)
-        return outputs
-
-
-class XLNetFeedForward(nn.Module):
-    """Feed-forward block: linear, activation, dropout, linear, dropout, then a residual layer norm."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
-        self.layer_1 = nn.Linear(config.d_model, config.d_inner)
-        self.layer_2 = nn.Linear(config.d_inner, config.d_model)
-        self.dropout = nn.Dropout(config.dropout)
-        # `ff_activation` is either a key into ACT2FN or is stored as given.
-        activation = config.ff_activation
-        self.activation_function = ACT2FN[activation] if isinstance(activation, str) else activation
-
-    def forward(self, inp):
-        """Apply the feed-forward transformation to `inp` and return the normalised residual sum."""
-        hidden = self.dropout(self.activation_function(self.layer_1(inp)))
-        projected = self.dropout(self.layer_2(hidden))
-        # Residual connection around the whole block, normalised afterwards.
-        return self.layer_norm(projected + inp)
-
-
-class XLNetLayer(nn.Module):
-    """One XLNet block: relative attention followed by a feed-forward sub-layer shared by both streams."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.rel_attn = XLNetRelativeAttention(config)
-        self.ff = XLNetFeedForward(config)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def forward(
-        self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None
-    ):
-        """Return `(output_h, output_g)` followed by whatever extra items the attention module returned."""
-        attn_kwargs = {"mems": mems, "target_mapping": target_mapping, "head_mask": head_mask}
-        attn_outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, **attn_kwargs)
-        output_h, output_g = attn_outputs[:2]
-        extras = attn_outputs[2:]  # attentions, when the attention module returned them
-
-        # The query stream (when present) goes through the feed-forward block before the content stream.
-        if output_g is not None:
-            output_g = self.ff(output_g)
-        output_h = self.ff(output_h)
-
-        return (output_h, output_g) + extras
-
-
-class XLNetPreTrainedModel(PreTrainedModel):
-    """ Abstract base class that handles weights initialization and provides
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = XLNetConfig
-    pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_xlnet
-    base_model_prefix = "transformer"
-
-    def _init_weights(self, module):
-        """ Initialize the weights of `module` according to its type; other module types are left untouched.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-            return
-
-        if isinstance(module, XLNetLayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-            return
-
-        if isinstance(module, XLNetRelativeAttention):
-            # Raw projection, bias and segment-embedding parameters, drawn in this fixed order.
-            names = ("q", "k", "v", "o", "r", "r_r_bias", "r_s_bias", "r_w_bias", "seg_embed")
-            params = [getattr(module, name) for name in names]
-            for param in params:
-                param.data.normal_(mean=0.0, std=self.config.initializer_range)
-            return
-
-        if isinstance(module, XLNetModel):
-            module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
-
-
-XLNET_START_DOCSTRING = r"""
-
-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
-    usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XLNET_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
-            given to this model should not be passed as input ids as they have already been computed.
-        perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
-            If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
-            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
-            If None, each token attends to all the others (full bidirectional attention).
-            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
-        target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to indicate the output tokens to use.
-            If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
-            Only used during pretraining for partial prediction or for sequential decoding (generation).
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        input_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Mask to avoid performing attention on padding token indices.
-            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
-            Kept for compatibility with the original code base.
-            You can only use one of `input_mask` and `attention_mask`.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
-    XLNET_START_DOCSTRING,
-)
-class XLNetModel(XLNetPreTrainedModel):
-    def __init__(self, config):
-        """Build the word embedding, mask embedding, `config.n_layer` XLNetLayer blocks and dropout from `config`."""
-        super().__init__(config)
-        # Flags controlling which optional items `forward` appends to its output tuple.
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.output_past = config.output_past
-
-        # Settings copied from the config and read by the mask, memory and positional-encoding helpers.
-        self.mem_len = config.mem_len
-        self.reuse_len = config.reuse_len
-        self.d_model = config.d_model
-        self.same_length = config.same_length
-        self.attn_type = config.attn_type
-        self.bi_data = config.bi_data
-        self.clamp_len = config.clamp_len
-        self.n_layer = config.n_layer
-
-        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
-        # Allocated without explicit values here; presumably filled through `init_weights()` below.
-        self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
-        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
-        self.dropout = nn.Dropout(config.dropout)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        """Return the word embedding module (`self.word_embedding`)."""
-        return self.word_embedding
-
-    def set_input_embeddings(self, new_embeddings):
-        """Replace the word embedding module with `new_embeddings`."""
-        self.word_embedding = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """Head pruning is not implemented for this model; always raises NotImplementedError."""
-        raise NotImplementedError
-
-    def create_mask(self, qlen, mlen):
-        """
-        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
-
-        Args:
-            qlen: Sequence length
-            mlen: Memory length (number of cached positions placed before the sequence)
-
-        ::
-
-                  same_length=False:      same_length=True:
-                  <mlen > <  qlen >       <mlen > <  qlen >
-               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
-                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
-            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
-                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
-               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
-
-        """
-        square = torch.ones([qlen, qlen])
-        # Memory columns are never masked; within the sequence everything above the diagonal is.
-        mask = torch.cat([torch.zeros([qlen, mlen]), torch.triu(square, diagonal=1)], dim=1)
-        if self.same_length:
-            # Also mask the strictly-lower triangle over the first `qlen` columns.
-            mask[:, :qlen] += torch.tril(square, diagonal=-1)
-
-        # Match the dtype and device of the model parameters.
-        return mask.to(next(self.parameters()))
-
-    def cache_mem(self, curr_out, prev_mem):
-        """Return the detached hidden states to keep as memory: the last `self.mem_len` positions."""
-        reuse_len = self.reuse_len
-        if reuse_len is not None and reuse_len > 0:
-            # Only the first `reuse_len` positions of the current output are eligible for caching.
-            curr_out = curr_out[:reuse_len]
-
-        # Previous memory, when present, is placed before the current output.
-        candidates = curr_out if prev_mem is None else torch.cat([prev_mem, curr_out], dim=0)
-        return candidates[-self.mem_len :].detach()
-
-    @staticmethod
-    def positional_embedding(pos_seq, inv_freq, bsz=None):
-        """Build sinusoidal embeddings for `pos_seq`: sines then cosines on the last axis, with a batch axis inserted."""
-        angles = torch.einsum("i,d->id", pos_seq, inv_freq)
-        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1).unsqueeze(1)
-        # Without `bsz` the inserted batch axis keeps size 1; otherwise it is expanded to `bsz`.
-        return emb if bsz is None else emb.expand(-1, bsz, -1)
-
-    def relative_positional_encoding(self, qlen, klen, bsz=None):
-        """Create the relative positional encoding used by the attention layers.
-
-        Args:
-            qlen: upper bound (negated) of the position range when `attn_type` is "bi".
-            klen: starting position of the descending position range.
-            bsz: optional batch size to expand the encoding to; with `bi_data` each
-                direction is expanded to `bsz // 2` and the two are concatenated on the batch axis.
-
-        Returns:
-            The positional embedding, converted to the dtype/device of the model parameters.
-
-        Raises:
-            ValueError: if `self.attn_type` is neither "bi" nor "uni".
-        """
-        freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
-        inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
-
-        if self.attn_type == "bi":
-            # beg, end = klen - 1, -qlen
-            beg, end = klen, -qlen
-        elif self.attn_type == "uni":
-            # beg, end = klen - 1, -1
-            beg, end = klen, -1
-        else:
-            raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))
-
-        if self.bi_data:
-            # Forward and backward position sequences, each serving half of the batch.
-            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
-            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float)
-
-            if self.clamp_len > 0:
-                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-                bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-
-            if bsz is not None:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
-            else:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
-
-            pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
-        else:
-            # Pin the dtype like every other position sequence here, so it always matches `inv_freq`
-            # regardless of the process-wide default dtype.
-            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
-            if self.clamp_len > 0:
-                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
-
-        pos_emb = pos_emb.to(next(self.parameters()))
-        return pos_emb
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetModel
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetModel.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-        """
-        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
-        # but we want a unified interface in the library with the batch size on the first dimension
-        # so we move here the first dimension (batch) to the end
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_ids = input_ids.transpose(0, 1).contiguous()
-            qlen, bsz = input_ids.shape[0], input_ids.shape[1]
-        elif inputs_embeds is not None:
-            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
-            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
-        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
-        attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
-        perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
-        target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
-
-        # Memory length comes from the first cached layer; the key length covers memory plus current input.
-        mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
-        klen = mlen + qlen
-
-        dtype_float = next(self.parameters()).dtype
-        device = next(self.parameters()).device
-
-        # Attention mask
-        # causal attention mask
-        if self.attn_type == "uni":
-            attn_mask = self.create_mask(qlen, mlen)
-            attn_mask = attn_mask[:, :, None, None]
-        elif self.attn_type == "bi":
-            attn_mask = None
-        else:
-            raise ValueError("Unsupported attention type: {}".format(self.attn_type))
-
-        # data mask: input mask & perm mask
-        # The message is parenthesised so that both string fragments belong to the assert
-        # (a bare second string on its own line would be a no-op expression statement).
-        assert input_mask is None or attention_mask is None, (
-            "You can only use one of input_mask (uses 1 for padding) "
-            "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
-        )
-        if input_mask is None and attention_mask is not None:
-            input_mask = 1.0 - attention_mask
-        if input_mask is not None and perm_mask is not None:
-            data_mask = input_mask[None] + perm_mask
-        elif input_mask is not None and perm_mask is None:
-            data_mask = input_mask[None]
-        elif input_mask is None and perm_mask is not None:
-            data_mask = perm_mask
-        else:
-            data_mask = None
-
-        if data_mask is not None:
-            # all mems can be attended to
-            if mlen > 0:
-                mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
-                data_mask = torch.cat([mems_mask, data_mask], dim=1)
-            if attn_mask is None:
-                attn_mask = data_mask[:, :, :, None]
-            else:
-                # Out-of-place add: the causal mask has a singleton batch axis, so an in-place `+=`
-                # cannot hold the broadcast result when the data mask spans more than one batch entry.
-                attn_mask = attn_mask + data_mask[:, :, :, None]
-
-        if attn_mask is not None:
-            attn_mask = (attn_mask > 0).to(dtype_float)
-
-        # Content-stream mask: same as `attn_mask`, except that each position may attend to itself.
-        if attn_mask is not None:
-            non_tgt_mask = -torch.eye(qlen).to(attn_mask)
-            if mlen > 0:
-                non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
-            non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
-        else:
-            non_tgt_mask = None
-
-        # Word embeddings and prepare h & g hidden states
-        if inputs_embeds is not None:
-            word_emb_k = inputs_embeds
-        else:
-            word_emb_k = self.word_embedding(input_ids)
-        output_h = self.dropout(word_emb_k)
-        if target_mapping is not None:
-            word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
-            # else:  # We removed the inp_q input which was same as target mapping
-            #     inp_q_ext = inp_q[:, :, None]
-            #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
-            output_g = self.dropout(word_emb_q)
-        else:
-            output_g = None
-
-        # Segment embedding
-        if token_type_ids is not None:
-            # Convert `token_type_ids` to one-hot `seg_mat`
-            if mlen > 0:
-                mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
-                cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
-            else:
-                cat_ids = token_type_ids
-
-            # `1` indicates not in the same segment [qlen x klen x bsz]
-            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
-            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
-        else:
-            seg_mat = None
-
-        # Positional encoding
-        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
-        pos_emb = self.dropout(pos_emb)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
-        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
-                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to float if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.n_layer
-
-        new_mems = ()
-        if mems is None:
-            mems = [None] * len(self.layer)
-
-        attentions = []
-        hidden_states = []
-        for i, layer_module in enumerate(self.layer):
-            if self.mem_len is not None and self.mem_len > 0 and self.output_past:
-                # cache new mems
-                new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
-            if self.output_hidden_states:
-                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
-
-            outputs = layer_module(
-                output_h,
-                output_g,
-                attn_mask_h=non_tgt_mask,
-                attn_mask_g=attn_mask,
-                r=pos_emb,
-                seg_mat=seg_mat,
-                mems=mems[i],
-                target_mapping=target_mapping,
-                head_mask=head_mask[i],
-            )
-            output_h, output_g = outputs[:2]
-            if self.output_attentions:
-                attentions.append(outputs[2])
-
-        # Add last hidden state
-        if self.output_hidden_states:
-            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
-
-        # The query stream is the model output when it exists; otherwise the content stream is.
-        output = self.dropout(output_g if output_g is not None else output_h)
-
-        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
-        outputs = (output.permute(1, 0, 2).contiguous(),)
-
-        if self.mem_len is not None and self.mem_len > 0 and self.output_past:
-            outputs = outputs + (new_mems,)
-
-        if self.output_hidden_states:
-            if output_g is not None:
-                # Each entry is an (h, g) pair here; the pairs are flattened into one tuple.
-                hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
-            else:
-                hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
-            outputs = outputs + (hidden_states,)
-        if self.output_attentions:
-            if target_mapping is not None:
-                # when target_mapping is provided, there are 2-tuple of attentions
-                attentions = tuple(
-                    tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions
-                )
-            else:
-                attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
-            outputs = outputs + (attentions,)
-
-        return outputs  # outputs, (new_mems), (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a language modeling head on top
-    (linear layer with weights tied to the input embeddings). """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetLMHeadModel(XLNetPreTrainedModel):
-    def __init__(self, config):
-        """Build the XLNetModel backbone and the linear head projecting `d_model` to `vocab_size`."""
-        super().__init__(config)
-        self.attn_type = config.attn_type
-        self.same_length = config.same_length
-
-        self.transformer = XLNetModel(config)
-        # Language-modeling head; returned by `get_output_embeddings`.
-        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        """Return the language-modeling head (`self.lm_loss`)."""
-        return self.lm_loss
-
-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
-        """Build the inputs for predicting one more token after `input_ids`.
-
-        A dummy token (id 0) is appended to every row, a permutation mask hides that last
-        position from all positions, and the target mapping selects the last position of
-        every row as the single prediction target.
-
-        Args:
-            input_ids: tensor whose first axis is the batch and second axis the sequence.
-            **model_kwargs: if a truthy `past` entry is present it is forwarded as `mems`.
-
-        Returns:
-            dict with `input_ids`, `perm_mask`, `target_mapping` and, optionally, `mems`.
-        """
-        # Add dummy token at the end (no attention on this one)
-        effective_batch_size = input_ids.shape[0]
-        dummy_token = torch.zeros((effective_batch_size, 1), dtype=torch.long, device=input_ids.device)
-        input_ids = torch.cat([input_ids, dummy_token], dim=1)
-
-        # Build permutation mask so that previous tokens don't see last token
-        sequence_length = input_ids.shape[1]
-        perm_mask = torch.zeros(
-            (effective_batch_size, sequence_length, sequence_length), dtype=torch.float, device=input_ids.device
-        )
-        perm_mask[:, :, -1] = 1.0
-
-        # We'll only predict the last token
-        target_mapping = torch.zeros(
-            (effective_batch_size, 1, sequence_length), dtype=torch.float, device=input_ids.device
-        )
-        # Select the last position for every row of the batch, not only the first one.
-        target_mapping[:, 0, -1] = 1.0
-
-        inputs = {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}
-
-        # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
-
-        return inputs
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
-            Language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetLMHeadModel
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
-
-        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True)).unsqueeze(0)  # We will predict the masked token
-        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
-
-        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            token_type_ids=token_type_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        logits = self.lm_loss(transformer_outputs[0])
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        if labels is not None:
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetForSequenceClassification(XLNetPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XLNetModel(config)
-        self.sequence_summary = SequenceSummary(config)
-        self.logits_proj = nn.Linear(config.d_model, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`)
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForSequenceClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            token_type_ids=token_type_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        output = transformer_outputs[0]
-
-        output = self.sequence_summary(output)
-        logits = self.logits_proj(output)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetForTokenClassification(XLNetPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XLNetModel(config)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
-            Classification scores (before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForTokenClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        scores = outputs[0]
-
-        """
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            token_type_ids=token_type_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetForMultipleChoice(XLNetPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.transformer = XLNetModel(config)
-        self.sequence_summary = SequenceSummary(config)
-        self.logits_proj = nn.Linear(config.d_model, 1)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        token_type_ids=None,
-        input_mask=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        labels=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
-
-            Classification scores (before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForMultipleChoice
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
-        """
-        num_choices = input_ids.shape[1]
-
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
-
-        transformer_outputs = self.transformer(
-            flat_input_ids,
-            token_type_ids=flat_token_type_ids,
-            input_mask=flat_input_mask,
-            attention_mask=flat_attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        output = transformer_outputs[0]
-
-        output = self.sequence_summary(output)
-        logits = self.logits_proj(output)
-        reshaped_logits = logits.view(-1, num_choices)
-        outputs = (reshaped_logits,) + transformer_outputs[
-            1:
-        ]  # Keep mems, hidden states, attentions if there are in it
-
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XLNetModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
-        """
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            token_type_ids=token_type_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING,
-)
-class XLNetForQuestionAnswering(XLNetPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.start_n_top = config.start_n_top
-        self.end_n_top = config.end_n_top
-
-        self.transformer = XLNetModel(config)
-        self.start_logits = PoolerStartLogits(config)
-        self.end_logits = PoolerEndLogits(config)
-        self.answer_class = PoolerAnswerClass(config)
-
-        self.init_weights()
-
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        mems=None,
-        perm_mask=None,
-        target_mapping=None,
-        token_type_ids=None,
-        input_mask=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-        is_impossible=None,
-        cls_index=None,
-        p_mask=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels whether a question has an answer or no answer (SQuAD 2.0)
-        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
-            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
-        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
-            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
-            1.0 means token should be masked. 0.0 mean token is not masked.
-
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
-            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Indices for the top config.start_n_top start token possibilities (beam-search).
-        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the ``is_impossible`` label of the answers.
-        mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForQuestionAnswering
-        import torch
-
-        tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
-        """
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            mems=mems,
-            perm_mask=perm_mask,
-            target_mapping=target_mapping,
-            token_type_ids=token_type_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        hidden_states = transformer_outputs[0]
-        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
-
-        outputs = transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
-
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, let's remove the dimension added by batch splitting
-            for x in (start_positions, end_positions, cls_index, is_impossible):
-                if x is not None and x.dim() > 1:
-                    x.squeeze_(-1)
-
-            # during training, compute the end logits based on the ground truth of the start position
-            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
-
-            loss_fct = CrossEntropyLoss()
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-
-            if cls_index is not None and is_impossible is not None:
-                # Predict answerability from the representation of CLS and START
-                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
-                loss_fct_cls = nn.BCEWithLogitsLoss()
-                cls_loss = loss_fct_cls(cls_logits, is_impossible)
-
-                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
-                total_loss += cls_loss * 0.5
-
-            outputs = (total_loss,) + outputs
-
-        else:
-            # during inference, compute the end logits based on beam search
-            bsz, slen, hsz = hidden_states.size()
-            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
-
-            start_top_log_probs, start_top_index = torch.topk(
-                start_log_probs, self.start_n_top, dim=-1
-            )  # shape (bsz, start_n_top)
-            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
-            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
-
-            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
-                start_states
-            )  # shape (bsz, slen, start_n_top, hsz)
-            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
-            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
-
-            end_top_log_probs, end_top_index = torch.topk(
-                end_log_probs, self.end_n_top, dim=1
-            )  # shape (bsz, end_n_top, start_n_top)
-            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
-            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
-
-            start_states = torch.einsum(
-                "blh,bl->bh", hidden_states, start_log_probs
-            )  # get the representation of START as weighted sum of hidden states
-            cls_logits = self.answer_class(
-                hidden_states, start_states=start_states, cls_index=cls_index
-            )  # Shape (batch size,): one single `cls_logits` for each sample
-
-            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
-
-        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
-        # or (if labels are provided) (total_loss,)
-        return outputs
diff --git a/server/transformers/src/transformers/optimization.py b/server/transformers/src/transformers/optimization.py
deleted file mode 100644
index 5ab7647638e054192b1a122b2121b5c5059ca85d..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/optimization.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch optimization for BERT model."""
-
-import logging
-import math
-
-import torch
-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LambdaLR
-
-
-logger = logging.getLogger(__name__)
-
-
-def get_constant_schedule(optimizer, last_epoch=-1):
-    """ Create a schedule with a constant learning rate.
-    """
-    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
-
-
-def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
-    """ Create a schedule with a constant learning rate preceded by a warmup
-    period during which the learning rate increases linearly between 0 and 1.
-    """
-
-    def lr_lambda(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1.0, num_warmup_steps))
-        return 1.0
-
-    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
-
-
-def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
-    """ Create a schedule with a learning rate that decreases linearly after
-    linearly increasing during a warmup period.
-    """
-
-    def lr_lambda(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        return max(
-            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
-        )
-
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
-
-
-def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
-    """ Create a schedule with a learning rate that decreases following the
-    values of the cosine function between 0 and `pi * cycles` after a warmup
-    period during which it increases linearly between 0 and 1.
-    """
-
-    def lr_lambda(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
-        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
-
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
-
-
-def get_cosine_with_hard_restarts_schedule_with_warmup(
-    optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1
-):
-    """ Create a schedule with a learning rate that decreases following the
-    values of the cosine function with several hard restarts, after a warmup
-    period during which it increases linearly between 0 and 1.
-    """
-
-    def lr_lambda(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
-        if progress >= 1.0:
-            return 0.0
-        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
-
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
-
-
-class AdamW(Optimizer):
-    """ Implements Adam algorithm with weight decay fix.
-
-    Parameters:
-        lr (float): learning rate. Default 1e-3.
-        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
-        eps (float): Adams epsilon. Default: 1e-6
-        weight_decay (float): Weight decay. Default: 0.0
-        correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
-        if lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
-        super().__init__(params, defaults)
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                beta1, beta2 = group["betas"]
-
-                state["step"] += 1
-
-                # Decay the first and second moment running average coefficient
-                # In-place operations to update the averages at the same time
-                exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
-                denom = exp_avg_sq.sqrt().add_(group["eps"])
-
-                step_size = group["lr"]
-                if group["correct_bias"]:  # No bias correction for Bert
-                    bias_correction1 = 1.0 - beta1 ** state["step"]
-                    bias_correction2 = 1.0 - beta2 ** state["step"]
-                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
-
-                p.data.addcdiv_(-step_size, exp_avg, denom)
-
-                # Just adding the square of the weights to the loss function is *not*
-                # the correct way of using L2 regularization/weight decay with Adam,
-                # since that will interact with the m and v parameters in strange ways.
-                #
-                # Instead we want to decay the weights in a manner that doesn't interact
-                # with the m/v parameters. This is equivalent to adding the square
-                # of the weights to the loss with plain (non-momentum) SGD.
-                # Add weight decay at the end (fixed version)
-                if group["weight_decay"] > 0.0:
-                    p.data.add_(-group["lr"] * group["weight_decay"], p.data)
-
-        return loss
diff --git a/server/transformers/src/transformers/optimization_tf.py b/server/transformers/src/transformers/optimization_tf.py
deleted file mode 100644
index d232370905e241a5029200f6f4229f6000368623..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/optimization_tf.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions and classes related to optimization (weight updates)."""
-
-
-import re
-
-import tensorflow as tf
-
-
-class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
-    """Applys a warmup schedule on a given learning rate decay schedule."""
-
-    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
-        super().__init__()
-        self.initial_learning_rate = initial_learning_rate
-        self.warmup_steps = warmup_steps
-        self.power = power
-        self.decay_schedule_fn = decay_schedule_fn
-        self.name = name
-
-    def __call__(self, step):
-        with tf.name_scope(self.name or "WarmUp") as name:
-            # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
-            # learning rate will be `global_step/num_warmup_steps * init_lr`.
-            global_step_float = tf.cast(step, tf.float32)
-            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
-            warmup_percent_done = global_step_float / warmup_steps_float
-            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
-            return tf.cond(
-                global_step_float < warmup_steps_float,
-                lambda: warmup_learning_rate,
-                lambda: self.decay_schedule_fn(step),
-                name=name,
-            )
-
-    def get_config(self):
-        return {
-            "initial_learning_rate": self.initial_learning_rate,
-            "decay_schedule_fn": self.decay_schedule_fn,
-            "warmup_steps": self.warmup_steps,
-            "power": self.power,
-            "name": self.name,
-        }
-
-
-def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
-    """Creates an optimizer with learning rate schedule."""
-    # Implements linear decay of the learning rate.
-    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
-        initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0
-    )
-    if num_warmup_steps:
-        learning_rate_fn = WarmUp(
-            initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
-        )
-    optimizer = AdamWeightDecay(
-        learning_rate=learning_rate_fn,
-        weight_decay_rate=0.01,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-6,
-        exclude_from_weight_decay=["layer_norm", "bias"],
-    )
-    return optimizer
-
-
-class AdamWeightDecay(tf.keras.optimizers.Adam):
-    """Adam enables L2 weight decay and clip_by_global_norm on gradients.
-
-  Just adding the square of the weights to the loss function is *not* the
-  correct way of using L2 regularization/weight decay with Adam, since that will
-  interact with the m and v parameters in strange ways.
-
-  Instead we want ot decay the weights in a manner that doesn't interact with
-  the m/v parameters. This is equivalent to adding the square of the weights to
-  the loss with plain (non-momentum) SGD.
-  """
-
-    def __init__(
-        self,
-        learning_rate=0.001,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-7,
-        amsgrad=False,
-        weight_decay_rate=0.0,
-        include_in_weight_decay=None,
-        exclude_from_weight_decay=None,
-        name="AdamWeightDecay",
-        **kwargs
-    ):
-        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
-        self.weight_decay_rate = weight_decay_rate
-        self._include_in_weight_decay = include_in_weight_decay
-        self._exclude_from_weight_decay = exclude_from_weight_decay
-
-    @classmethod
-    def from_config(cls, config):
-        """Creates an optimizer from its config with WarmUp custom object."""
-        custom_objects = {"WarmUp": WarmUp}
-        return super().from_config(config, custom_objects=custom_objects)
-
-    def _prepare_local(self, var_device, var_dtype, apply_state):
-        super()._prepare_local(var_device, var_dtype, apply_state)
-        apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate")
-
-    def _decay_weights_op(self, var, learning_rate, apply_state):
-        do_decay = self._do_use_weight_decay(var.name)
-        if do_decay:
-            return var.assign_sub(
-                learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking
-            )
-        return tf.no_op()
-
-    def apply_gradients(self, grads_and_vars, clip_norm, name=None):
-        grads, tvars = list(zip(*grads_and_vars))
-        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
-        return super().apply_gradients(zip(grads, tvars))
-
-    def _get_lr(self, var_device, var_dtype, apply_state):
-        """Retrieves the learning rate with the given state."""
-        if apply_state is None:
-            return self._decayed_lr_t[var_dtype], {}
-
-        apply_state = apply_state or {}
-        coefficients = apply_state.get((var_device, var_dtype))
-        if coefficients is None:
-            coefficients = self._fallback_apply_state(var_device, var_dtype)
-            apply_state[(var_device, var_dtype)] = coefficients
-
-        return coefficients["lr_t"], dict(apply_state=apply_state)
-
-    def _resource_apply_dense(self, grad, var, apply_state=None):
-        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-        decay = self._decay_weights_op(var, lr_t, apply_state)
-        with tf.control_dependencies([decay]):
-            return super()._resource_apply_dense(grad, var, **kwargs)
-
-    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
-        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-        decay = self._decay_weights_op(var, lr_t, apply_state)
-        with tf.control_dependencies([decay]):
-            return super()._resource_apply_sparse(grad, var, indices, **kwargs)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update({"weight_decay_rate": self.weight_decay_rate})
-        return config
-
-    def _do_use_weight_decay(self, param_name):
-        """Whether to use L2 weight decay for `param_name`."""
-        if self.weight_decay_rate == 0:
-            return False
-
-        if self._include_in_weight_decay:
-            for r in self._include_in_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return True
-
-        if self._exclude_from_weight_decay:
-            for r in self._exclude_from_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-
-# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
-class GradientAccumulator(object):
-    """Distribution strategies-aware gradient accumulation utility."""
-
-    def __init__(self):
-        """Initializes the accumulator."""
-        self._gradients = []
-        self._accum_steps = tf.Variable(
-            initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA
-        )
-
-    @property
-    def step(self):
-        """Number of accumulated steps."""
-        return self._accum_steps.value()
-
-    @property
-    def gradients(self):
-        """The accumulated gradients."""
-        return list(
-            gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()
-        )
-
-    def __call__(self, gradients):
-        """Accumulates :obj:`gradients`."""
-        if not self._gradients:
-            self._gradients.extend(
-                [
-                    tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient
-                    for gradient in gradients
-                ]
-            )
-
-        if len(gradients) != len(self._gradients):
-            raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
-
-        for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
-            if accum_gradient is not None:
-                accum_gradient.assign_add(gradient)
-
-        self._accum_steps.assign_add(1)
-
-    def reset(self):
-        """Resets the accumulated gradients."""
-        if self._gradients:
-            self._accum_steps.assign(0)
-
-        for gradient in self._get_replica_gradients():
-            if gradient is not None:
-                gradient.assign(tf.zeros_like(gradient))
-
-    def _get_replica_gradients(self):
-        if tf.distribute.has_strategy():
-            # In a replica context, we want to accumulate gradients on each replica
-            # without synchronization, so we directly assign the value of the
-            # current replica.
-            replica_context = tf.distribute.get_replica_context()
-
-            if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
-                return self._gradients
-
-            return (
-                gradient.device_map.select_for_current_replica(gradient.values, replica_context)
-                for gradient in self._gradients
-            )
-        else:
-            return self._gradients
diff --git a/server/transformers/src/transformers/pipelines.py b/server/transformers/src/transformers/pipelines.py
deleted file mode 100755
index d694afbaa5d9cb7cad87484c510d9dee0c73f5d0..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/pipelines.py
+++ /dev/null
@@ -1,1087 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import csv
-import json
-import logging
-import os
-import pickle
-import sys
-from abc import ABC, abstractmethod
-from contextlib import contextmanager
-from os.path import abspath, exists
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-
-from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
-from .configuration_distilbert import DistilBertConfig
-from .configuration_roberta import RobertaConfig
-from .configuration_utils import PretrainedConfig
-from .configuration_xlm import XLMConfig
-from .data import SquadExample, squad_convert_examples_to_features
-from .file_utils import is_tf_available, is_torch_available
-from .modelcard import ModelCard
-from .tokenization_auto import AutoTokenizer
-from .tokenization_bert import BasicTokenizer
-from .tokenization_utils import PreTrainedTokenizer
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from .modeling_tf_auto import (
-        TFAutoModel,
-        TFAutoModelForSequenceClassification,
-        TFAutoModelForQuestionAnswering,
-        TFAutoModelForTokenClassification,
-        TFAutoModelWithLMHead,
-    )
-
-if is_torch_available():
-    import torch
-    from .modeling_auto import (
-        AutoModel,
-        AutoModelForSequenceClassification,
-        AutoModelForQuestionAnswering,
-        AutoModelForTokenClassification,
-        AutoModelWithLMHead,
-    )
-
-
-logger = logging.getLogger(__name__)
-
-
-def get_framework(model=None):
-    """ Select framework (TensorFlow/PyTorch) to use.
-        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
-    """
-    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
-        # Both framework are available but the user supplied a model class instance.
-        # Try to guess which framework to use from the model classname
-        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
-    elif not is_tf_available() and not is_torch_available():
-        raise RuntimeError(
-            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
-            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
-            "To install PyTorch, read the instructions at https://pytorch.org/."
-        )
-    else:
-        # framework = 'tf' if is_tf_available() else 'pt'
-        framework = "pt" if is_torch_available() else "tf"
-    return framework
-
-
-class ArgumentHandler(ABC):
-    """
-    Base interface for handling varargs for each Pipeline
-    """
-
-    @abstractmethod
-    def __call__(self, *args, **kwargs):
-        raise NotImplementedError()
-
-
-class DefaultArgumentHandler(ArgumentHandler):
-    """
-    Default varargs argument parser handling parameters for each Pipeline
-    """
-
-    def __call__(self, *args, **kwargs):
-        if "X" in kwargs:
-            return kwargs["X"]
-        elif "data" in kwargs:
-            return kwargs["data"]
-        elif len(args) == 1:
-            if isinstance(args[0], list):
-                return args[0]
-            else:
-                return [args[0]]
-        elif len(args) > 1:
-            return list(args)
-        raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)")
-
-
-class PipelineDataFormat:
-    """
-    Base class for all the pipeline supported data format both for reading and writing.
-    Supported data formats currently includes:
-     - JSON
-     - CSV
-     - stdin/stdout (pipe)
-
-    PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
-    to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
-    """
-
-    SUPPORTED_FORMATS = ["json", "csv", "pipe"]
-
-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
-        self.output_path = output_path
-        self.input_path = input_path
-        self.column = column.split(",") if column is not None else [""]
-        self.is_multi_columns = len(self.column) > 1
-
-        if self.is_multi_columns:
-            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]
-
-        if output_path is not None and not overwrite:
-            if exists(abspath(self.output_path)):
-                raise OSError("{} already exists on disk".format(self.output_path))
-
-        if input_path is not None:
-            if not exists(abspath(self.input_path)):
-                raise OSError("{} doesnt exist on disk".format(self.input_path))
-
-    @abstractmethod
-    def __iter__(self):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def save(self, data: dict):
-        """
-        Save the provided data object with the representation for the current `DataFormat`.
-        :param data: data to store
-        :return:
-        """
-        raise NotImplementedError()
-
-    def save_binary(self, data: Union[dict, List[dict]]) -> str:
-        """
-        Save the provided data object as a pickle-formatted binary data on the disk.
-        :param data: data to store
-        :return: (str) Path where the data has been saved
-        """
-        path, _ = os.path.splitext(self.output_path)
-        binary_path = os.path.extsep.join((path, "pickle"))
-
-        with open(binary_path, "wb+") as f_output:
-            pickle.dump(data, f_output)
-
-        return binary_path
-
-    @staticmethod
-    def from_str(
-        format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False
-    ):
-        if format == "json":
-            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
-        elif format == "csv":
-            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
-        elif format == "pipe":
-            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
-        else:
-            raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format))
-
-
-class CsvPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
-        super().__init__(output_path, input_path, column, overwrite=overwrite)
-
-    def __iter__(self):
-        with open(self.input_path, "r") as f:
-            reader = csv.DictReader(f)
-            for row in reader:
-                if self.is_multi_columns:
-                    yield {k: row[c] for k, c in self.column}
-                else:
-                    yield row[self.column[0]]
-
-    def save(self, data: List[dict]):
-        with open(self.output_path, "w") as f:
-            if len(data) > 0:
-                writer = csv.DictWriter(f, list(data[0].keys()))
-                writer.writeheader()
-                writer.writerows(data)
-
-
-class JsonPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
-        super().__init__(output_path, input_path, column, overwrite=overwrite)
-
-        with open(input_path, "r") as f:
-            self._entries = json.load(f)
-
-    def __iter__(self):
-        for entry in self._entries:
-            if self.is_multi_columns:
-                yield {k: entry[c] for k, c in self.column}
-            else:
-                yield entry[self.column[0]]
-
-    def save(self, data: dict):
-        with open(self.output_path, "w") as f:
-            json.dump(data, f)
-
-
-class PipedPipelineDataFormat(PipelineDataFormat):
-    """
-    Read data from piped input to the python process.
-    For multi columns data, columns should separated by \t
-
-    If columns are provided, then the output will be a dictionary with {column_x: value_x}
-    """
-
-    def __iter__(self):
-        for line in sys.stdin:
-            # Split for multi-columns
-            if "\t" in line:
-
-                line = line.split("\t")
-                if self.column:
-                    # Dictionary to map arguments
-                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
-                else:
-                    yield tuple(line)
-
-            # No dictionary to map arguments
-            else:
-                yield line
-
-    def save(self, data: dict):
-        print(data)
-
-    def save_binary(self, data: Union[dict, List[dict]]) -> str:
-        if self.output_path is None:
-            raise KeyError(
-                "When using piped input on pipeline outputting large object requires an output file path. "
-                "Please provide such output path through --output argument."
-            )
-
-        return super().save_binary(data)
-
-
-class _ScikitCompat(ABC):
-    """
-    Interface layer for the Scikit and Keras compatibility.
-    """
-
-    @abstractmethod
-    def transform(self, X):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def predict(self, X):
-        raise NotImplementedError()
-
-
-class Pipeline(_ScikitCompat):
-    """
-    Base class implementing pipelined operations.
-    Pipeline workflow is defined as a sequence of the following operations:
-        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
-
-    Pipeline supports running on CPU or GPU through the device argument. Users can specify
-    device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal.
-
-    Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large
-    tensor object as nested-lists. In order to avoid dumping such large structure as textual data we
-    provide the binary_output constructor argument. If set to True, the output will be stored in the
-    pickle format.
-
-    Arguments:
-        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
-            Reference to the model to use through this pipeline.
-
-        **tokenizer**: ``(str, PreTrainedTokenizer)``:
-            Reference to the tokenizer to use through this pipeline.
-
-        **args_parser**: ``ArgumentHandler``:
-            Reference to the object in charge of parsing supplied pipeline parameters.
-
-        **device**: ``int``:
-            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
-            on the associated CUDA device id.
-
-        **binary_output** ``bool`` (default: False):
-            Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.
-
-    Return:
-        Pipeline returns list or dictionary depending on:
-         - Does the user provided multiple sample
-         - The pipeline expose multiple fields in the output object
-
-    Examples:
-        nlp = pipeline('ner')
-        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
-        nlp = NerPipeline(model='...', config='...', tokenizer='...')
-        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
-    """
-
-    default_input_names = None
-
-    def __init__(
-        self,
-        model,
-        tokenizer: PreTrainedTokenizer = None,
-        modelcard: ModelCard = None,
-        framework: Optional[str] = None,
-        args_parser: ArgumentHandler = None,
-        device: int = -1,
-        binary_output: bool = False,
-    ):
-
-        if framework is None:
-            framework = get_framework()
-
-        self.model = model
-        self.tokenizer = tokenizer
-        self.modelcard = modelcard
-        self.framework = framework
-        self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device))
-        self.binary_output = binary_output
-        self._args_parser = args_parser or DefaultArgumentHandler()
-
-        # Special handling
-        if self.framework == "pt" and self.device.type == "cuda":
-            self.model = self.model.to(self.device)
-
-    def save_pretrained(self, save_directory):
-        """
-        Save the pipeline's model and tokenizer to the specified save_directory
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Provided path ({}) should be a directory".format(save_directory))
-            return
-
-        self.model.save_pretrained(save_directory)
-        self.tokenizer.save_pretrained(save_directory)
-        self.modelcard.save_pretrained(save_directory)
-
-    def transform(self, X):
-        """
-        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
-        """
-        return self(X=X)
-
-    def predict(self, X):
-        """
-        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
-        """
-        return self(X=X)
-
-    @contextmanager
-    def device_placement(self):
-        """
-        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
-        example:
-            # Explicitly ask for tensor allocation on CUDA device :0
-            nlp = pipeline(..., device=0)
-            with nlp.device_placement():
-                # Every framework specific tensor allocation will be done on the request device
-                output = nlp(...)
-        Returns:
-            Context manager
-        """
-        if self.framework == "tf":
-            with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
-                yield
-        else:
-            if self.device.type == "cuda":
-                torch.cuda.set_device(self.device)
-
-            yield
-
-    def ensure_tensor_on_device(self, **inputs):
-        """
-        Ensure PyTorch tensors are on the specified device.
-        :param inputs:
-        :return:
-        """
-        return {name: tensor.to(self.device) for name, tensor in inputs.items()}
-
-    def inputs_for_model(self, features: Union[dict, List[dict]]) -> Dict:
-        """
-        Generates the input dictionary with model-specific parameters.
-
-        Returns:
-            dict holding all the required parameters for model's forward
-        """
-        args = ["input_ids", "attention_mask"]
-
-        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
-            args += ["token_type_ids"]
-
-        # PR #1548 (CLI) There is an issue with attention_mask
-        # if 'xlnet' in model_type or 'xlm' in model_type:
-        #     args += ['cls_index', 'p_mask']
-
-        if isinstance(features, dict):
-            return {k: features[k] for k in args}
-        else:
-            return {k: [feature[k] for feature in features] for k in args}
-
-    def _parse_and_tokenize(self, *texts, **kwargs):
-        """
-        Parse arguments and tokenize
-        """
-        # Parse arguments
-        inputs = self._args_parser(*texts, **kwargs)
-        inputs = self.tokenizer.batch_encode_plus(
-            inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len
-        )
-
-        # Filter out features not available on specific models
-        inputs = self.inputs_for_model(inputs)
-
-        return inputs
-
-    def __call__(self, *texts, **kwargs):
-        inputs = self._parse_and_tokenize(*texts, **kwargs)
-        return self._forward(inputs)
-
-    def _forward(self, inputs, return_tensors=False):
-        """
-        Internal framework specific forward dispatching.
-        Args:
-            inputs: dict holding all the keyworded arguments for required by the model forward method.
-            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array.
-        Returns:
-            Numpy array
-        """
-        # Encode for forward
-        with self.device_placement():
-            if self.framework == "tf":
-                # TODO trace model
-                predictions = self.model(inputs, training=False)[0]
-            else:
-                with torch.no_grad():
-                    inputs = self.ensure_tensor_on_device(**inputs)
-                    predictions = self.model(**inputs)[0].cpu()
-
-        if return_tensors:
-            return predictions
-        else:
-            return predictions.numpy()
-
-
-class FeatureExtractionPipeline(Pipeline):
-    """
-    Feature extraction pipeline using Model head.
-    """
-
-    def __init__(
-        self,
-        model,
-        tokenizer: PreTrainedTokenizer = None,
-        modelcard: ModelCard = None,
-        framework: Optional[str] = None,
-        args_parser: ArgumentHandler = None,
-        device: int = -1,
-    ):
-        super().__init__(
-            model=model,
-            tokenizer=tokenizer,
-            modelcard=modelcard,
-            framework=framework,
-            args_parser=args_parser,
-            device=device,
-            binary_output=True,
-        )
-
-    def __call__(self, *args, **kwargs):
-        return super().__call__(*args, **kwargs).tolist()
-
-
-class TextClassificationPipeline(Pipeline):
-    """
-    Text classification pipeline using ModelForTextClassification head.
-    """
-
-    def __call__(self, *args, **kwargs):
-        outputs = super().__call__(*args, **kwargs)
-        scores = np.exp(outputs) / np.exp(outputs).sum(-1)
-        return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores]
-
-
-class FillMaskPipeline(Pipeline):
-    """
-    Masked language modeling prediction pipeline using ModelWithLMHead head.
-    """
-
-    def __init__(
-        self,
-        model,
-        tokenizer: PreTrainedTokenizer = None,
-        modelcard: ModelCard = None,
-        framework: Optional[str] = None,
-        args_parser: ArgumentHandler = None,
-        device: int = -1,
-        topk=5,
-    ):
-        super().__init__(
-            model=model,
-            tokenizer=tokenizer,
-            modelcard=modelcard,
-            framework=framework,
-            args_parser=args_parser,
-            device=device,
-            binary_output=True,
-        )
-
-        self.topk = topk
-
-    def __call__(self, *args, **kwargs):
-        inputs = self._parse_and_tokenize(*args, **kwargs)
-        outputs = self._forward(inputs, return_tensors=True)
-
-        results = []
-        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)
-
-        for i in range(batch_size):
-            input_ids = inputs["input_ids"][i]
-            result = []
-
-            if self.framework == "tf":
-                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item()
-                logits = outputs[i, masked_index, :]
-                probs = tf.nn.softmax(logits)
-                topk = tf.math.top_k(probs, k=self.topk)
-                values, predictions = topk.values.numpy(), topk.indices.numpy()
-            else:
-                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()
-                logits = outputs[i, masked_index, :]
-                probs = logits.softmax(dim=0)
-                values, predictions = probs.topk(self.topk)
-
-            for v, p in zip(values.tolist(), predictions.tolist()):
-                tokens = input_ids.numpy()
-                tokens[masked_index] = p
-                # Filter padding out:
-                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
-                result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p})
-
-            # Append
-            results += [result]
-
-        if len(results) == 1:
-            return results[0]
-        return results
-
-
-class NerPipeline(Pipeline):
-    """
-    Named Entity Recognition pipeline using ModelForTokenClassification head.
-    """
-
-    default_input_names = "sequences"
-
-    def __init__(
-        self,
-        model,
-        tokenizer: PreTrainedTokenizer = None,
-        modelcard: ModelCard = None,
-        framework: Optional[str] = None,
-        args_parser: ArgumentHandler = None,
-        device: int = -1,
-        binary_output: bool = False,
-        ignore_labels=["O"],
-    ):
-        super().__init__(
-            model=model,
-            tokenizer=tokenizer,
-            modelcard=modelcard,
-            framework=framework,
-            args_parser=args_parser,
-            device=device,
-            binary_output=binary_output,
-        )
-
-        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
-        self.ignore_labels = ignore_labels
-
-    def __call__(self, *texts, **kwargs):
-        inputs = self._args_parser(*texts, **kwargs)
-        answers = []
-        for sentence in inputs:
-
-            # Manage correct placement of the tensors
-            with self.device_placement():
-
-                tokens = self.tokenizer.encode_plus(
-                    sentence,
-                    return_attention_mask=False,
-                    return_tensors=self.framework,
-                    max_length=self.tokenizer.max_len,
-                )
-
-                # Forward
-                if self.framework == "tf":
-                    entities = self.model(tokens)[0][0].numpy()
-                    input_ids = tokens["input_ids"].numpy()[0]
-                else:
-                    with torch.no_grad():
-                        tokens = self.ensure_tensor_on_device(**tokens)
-                        entities = self.model(**tokens)[0][0].cpu().numpy()
-                        input_ids = tokens["input_ids"].cpu().numpy()[0]
-
-            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
-            labels_idx = score.argmax(axis=-1)
-
-            answer = []
-            for idx, label_idx in enumerate(labels_idx):
-                if self.model.config.id2label[label_idx] not in self.ignore_labels:
-                    answer += [
-                        {
-                            "word": self.tokenizer.decode([int(input_ids[idx])]),
-                            "score": score[idx][label_idx].item(),
-                            "entity": self.model.config.id2label[label_idx],
-                        }
-                    ]
-
-            # Append
-            answers += [answer]
-        if len(answers) == 1:
-            return answers[0]
-        return answers
-
-
-class QuestionAnsweringArgumentHandler(ArgumentHandler):
-    """
-    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
-    to internal SquadExample / SquadFeature structures.
-
-    QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied
-    arguments.
-    """
-
-    def __call__(self, *args, **kwargs):
-        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
-        if args is not None and len(args) > 0:
-            if len(args) == 1:
-                kwargs["X"] = args[0]
-            else:
-                kwargs["X"] = list(args)
-
-        # Generic compatibility with sklearn and Keras
-        # Batched data
-        if "X" in kwargs or "data" in kwargs:
-            inputs = kwargs["X"] if "X" in kwargs else kwargs["data"]
-
-            if isinstance(inputs, dict):
-                inputs = [inputs]
-            else:
-                # Copy to avoid overriding arguments
-                inputs = [i for i in inputs]
-
-            for i, item in enumerate(inputs):
-                if isinstance(item, dict):
-                    if any(k not in item for k in ["question", "context"]):
-                        raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
-
-                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)
-
-                elif not isinstance(item, SquadExample):
-                    raise ValueError(
-                        "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format(
-                            "X" if "X" in kwargs else "data"
-                        )
-                    )
-
-            # Tabular input
-        elif "question" in kwargs and "context" in kwargs:
-            if isinstance(kwargs["question"], str):
-                kwargs["question"] = [kwargs["question"]]
-
-            if isinstance(kwargs["context"], str):
-                kwargs["context"] = [kwargs["context"]]
-
-            inputs = [
-                QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"])
-            ]
-        else:
-            raise ValueError("Unknown arguments {}".format(kwargs))
-
-        if not isinstance(inputs, list):
-            inputs = [inputs]
-
-        return inputs
-
-
-class QuestionAnsweringPipeline(Pipeline):
-    """
-    Question Answering pipeline using ModelForQuestionAnswering head.
-    """
-
-    default_input_names = "question,context"
-
-    def __init__(
-        self,
-        model,
-        tokenizer: Optional[PreTrainedTokenizer],
-        modelcard: Optional[ModelCard],
-        framework: Optional[str] = None,
-        device: int = -1,
-        **kwargs
-    ):
-        super().__init__(
-            model=model,
-            tokenizer=tokenizer,
-            modelcard=modelcard,
-            framework=framework,
-            args_parser=QuestionAnsweringArgumentHandler(),
-            device=device,
-            **kwargs,
-        )
-
-    @staticmethod
-    def create_sample(
-        question: Union[str, List[str]], context: Union[str, List[str]]
-    ) -> Union[SquadExample, List[SquadExample]]:
-        """
-        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
-        This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s).
-        We currently support extractive question answering.
-        Arguments:
-             question: (str, List[str]) The question to be ask for the associated context
-             context: (str, List[str]) The context in which we will look for the answer.
-
-        Returns:
-            SquadExample initialized with the corresponding question and context.
-        """
-        if isinstance(question, list):
-            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
-        else:
-            return SquadExample(None, question, context, None, None, None)
-
-    def __call__(self, *texts, **kwargs):
-        """
-        Args:
-            We support multiple use-cases, the following are exclusive:
-            X: sequence of SquadExample
-            data: sequence of SquadExample
-            question: (str, List[str]), batch of question(s) to map along with context
-            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
-        Returns:
-            dict: {'answer': str, 'score": float, 'start": int, "end": int}
-            answer: the textual answer in the intial context
-            score: the score the current answer scored for the model
-            start: the character index in the original string corresponding to the beginning of the answer' span
-            end: the character index in the original string corresponding to the ending of the answer' span
-        """
-        # Set defaults values
-        kwargs.setdefault("topk", 1)
-        kwargs.setdefault("doc_stride", 128)
-        kwargs.setdefault("max_answer_len", 15)
-        kwargs.setdefault("max_seq_len", 384)
-        kwargs.setdefault("max_question_len", 64)
-
-        if kwargs["topk"] < 1:
-            raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"]))
-
-        if kwargs["max_answer_len"] < 1:
-            raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"]))
-
-        # Convert inputs to features
-        examples = self._args_parser(*texts, **kwargs)
-        features_list = [
-            squad_convert_examples_to_features(
-                [example],
-                self.tokenizer,
-                kwargs["max_seq_len"],
-                kwargs["doc_stride"],
-                kwargs["max_question_len"],
-                False,
-            )
-            for example in examples
-        ]
-        all_answers = []
-        for features, example in zip(features_list, examples):
-            fw_args = self.inputs_for_model([f.__dict__ for f in features])
-
-            # Manage tensor allocation on correct device
-            with self.device_placement():
-                if self.framework == "tf":
-                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
-                    start, end = self.model(fw_args)
-                    start, end = start.numpy(), end.numpy()
-                else:
-                    with torch.no_grad():
-                        # Retrieve the score for the context tokens only (removing question tokens)
-                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
-                        start, end = self.model(**fw_args)
-                        start, end = start.cpu().numpy(), end.cpu().numpy()
-
-            answers = []
-            for (feature, start_, end_) in zip(features, start, end):
-                # Normalize logits and spans to retrieve the answer
-                start_ = np.exp(start_) / np.sum(np.exp(start_))
-                end_ = np.exp(end_) / np.sum(np.exp(end_))
-
-                # Mask padding and question
-                start_, end_ = (
-                    start_ * np.abs(np.array(feature.p_mask) - 1),
-                    end_ * np.abs(np.array(feature.p_mask) - 1),
-                )
-
-                # TODO : What happens if not possible
-                # Mask CLS
-                start_[0] = end_[0] = 0
-
-                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
-                char_to_word = np.array(example.char_to_word_offset)
-
-                # Convert the answer (tokens) back to the original text
-                answers += [
-                    {
-                        "score": score.item(),
-                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                        "answer": " ".join(
-                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
-                        ),
-                    }
-                    for s, e, score in zip(starts, ends, scores)
-                ]
-            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
-            all_answers += answers
-
-        if len(all_answers) == 1:
-            return all_answers[0]
-        return all_answers
-
-    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
-        """
-        Take the output of any QuestionAnswering head and will generate probalities for each span to be
-        the actual answer.
-        In addition, it filters out some unwanted/impossible cases like answer len being greater than
-        max_answer_len or answer end position being before the starting position.
-        The method supports output the k-best answer through the topk argument.
-
-        Args:
-            start: numpy array, holding individual start probabilities for each token
-            end: numpy array, holding individual end probabilities for each token
-            topk: int, indicates how many possible answer span(s) to extract from the model's output
-            max_answer_len: int, maximum size of the answer to extract from the model's output
-        """
-        # Ensure we have batch axis
-        if start.ndim == 1:
-            start = start[None]
-
-        if end.ndim == 1:
-            end = end[None]
-
-        # Compute the score of each tuple(start, end) to be the real answer
-        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
-
-        # Remove candidate with end < start and end - start > max_answer_len
-        candidates = np.tril(np.triu(outer), max_answer_len - 1)
-
-        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
-        scores_flat = candidates.flatten()
-        if topk == 1:
-            idx_sort = [np.argmax(scores_flat)]
-        elif len(scores_flat) < topk:
-            idx_sort = np.argsort(-scores_flat)
-        else:
-            idx = np.argpartition(-scores_flat, topk)[0:topk]
-            idx_sort = idx[np.argsort(-scores_flat[idx])]
-
-        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
-        return start, end, candidates[0, start, end]
-
-    def span_to_answer(self, text: str, start: int, end: int):
-        """
-        When decoding from token probalities, this method maps token indexes to actual word in
-        the initial context.
-
-        Args:
-            text: str, the actual context to extract the answer from
-            start: int, starting answer token index
-            end: int, ending answer token index
-
-        Returns:
-            dict: {'answer': str, 'start': int, 'end': int}
-        """
-        words = []
-        token_idx = char_start_idx = char_end_idx = chars_idx = 0
-
-        for i, word in enumerate(text.split(" ")):
-            token = self.tokenizer.tokenize(word)
-
-            # Append words if they are in the span
-            if start <= token_idx <= end:
-                if token_idx == start:
-                    char_start_idx = chars_idx
-
-                if token_idx == end:
-                    char_end_idx = chars_idx + len(word)
-
-                words += [word]
-
-            # Stop if we went over the end of the answer
-            if token_idx > end:
-                break
-
-            # Append the subtokenization length to the running index
-            token_idx += len(token)
-            chars_idx += len(word) + 1
-
-        # Join text with spaces
-        return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)}
-
-
-# Register all the supported task here
-SUPPORTED_TASKS = {
-    "feature-extraction": {
-        "impl": FeatureExtractionPipeline,
-        "tf": TFAutoModel if is_tf_available() else None,
-        "pt": AutoModel if is_torch_available() else None,
-        "default": {
-            "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"},
-            "config": None,
-            "tokenizer": "distilbert-base-uncased",
-        },
-    },
-    "sentiment-analysis": {
-        "impl": TextClassificationPipeline,
-        "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
-        "pt": AutoModelForSequenceClassification if is_torch_available() else None,
-        "default": {
-            "model": {
-                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
-                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
-            },
-            "config": "distilbert-base-uncased-finetuned-sst-2-english",
-            "tokenizer": "distilbert-base-uncased",
-        },
-    },
-    "ner": {
-        "impl": NerPipeline,
-        "tf": TFAutoModelForTokenClassification if is_tf_available() else None,
-        "pt": AutoModelForTokenClassification if is_torch_available() else None,
-        "default": {
-            "model": {
-                "pt": "dbmdz/bert-large-cased-finetuned-conll03-english",
-                "tf": "dbmdz/bert-large-cased-finetuned-conll03-english",
-            },
-            "config": "dbmdz/bert-large-cased-finetuned-conll03-english",
-            "tokenizer": "bert-large-cased",
-        },
-    },
-    "question-answering": {
-        "impl": QuestionAnsweringPipeline,
-        "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None,
-        "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
-        "default": {
-            "model": {
-                "pt": "distilbert-base-uncased-distilled-squad",
-                "tf": "distilbert-base-uncased-distilled-squad",
-            },
-            "config": None,
-            "tokenizer": "distilbert-base-uncased",
-        },
-    },
-    "fill-mask": {
-        "impl": FillMaskPipeline,
-        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
-        "pt": AutoModelWithLMHead if is_torch_available() else None,
-        "default": {
-            "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"},
-            "config": None,
-            "tokenizer": "distilroberta-base",
-        },
-    },
-}
-
-
-def pipeline(
-    task: str,
-    model: Optional = None,
-    config: Optional[Union[str, PretrainedConfig]] = None,
-    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
-    modelcard: Optional[Union[str, ModelCard]] = None,
-    **kwargs
-) -> Pipeline:
-    """
-    Utility factory method to build a pipeline.
-    Pipeline are made of:
-        A Tokenizer instance in charge of mapping raw textual input to token
-        A Model instance
-        Some (optional) post processing for enhancing model's output
-
-    Examples:
-        pipeline('sentiment-analysis')
-        pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
-        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
-        pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
-        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
-    """
-    # Retrieve the task
-    if task not in SUPPORTED_TASKS:
-        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
-
-    framework = get_framework(model)
-
-    targeted_task = SUPPORTED_TASKS[task]
-    task, model_class = targeted_task["impl"], targeted_task[framework]
-
-    # Use default model/config/tokenizer for the task if no model is provided
-    if model is None:
-        models, config, tokenizer = tuple(targeted_task["default"].values())
-        model = models[framework]
-
-    # Try to infer tokenizer from model or config name (if provided as str)
-    if tokenizer is None:
-        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-            tokenizer = model
-        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
-            tokenizer = config
-        else:
-            # Impossible to guest what is the right tokenizer here
-            raise Exception(
-                "Impossible to guess which tokenizer to use. "
-                "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
-            )
-
-    # Try to infer modelcard from model or config name (if provided as str)
-    if modelcard is None:
-        # Try to fallback on one of the provided string for model or config (will replace the suffix)
-        if isinstance(model, str):
-            modelcard = model
-        elif isinstance(config, str):
-            modelcard = config
-
-    # Instantiate tokenizer if needed
-    if isinstance(tokenizer, str):
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
-
-    # Instantiate config if needed
-    if isinstance(config, str):
-        config = AutoConfig.from_pretrained(config)
-
-    # Instantiate modelcard if needed
-    if isinstance(modelcard, str):
-        modelcard = ModelCard.from_pretrained(modelcard)
-
-    # Instantiate model if needed
-    if isinstance(model, str):
-        # Handle transparent TF/PT model conversion
-        model_kwargs = {}
-        if framework == "pt" and model.endswith(".h5"):
-            model_kwargs["from_tf"] = True
-            logger.warning(
-                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
-                "Trying to load the model with PyTorch."
-            )
-        elif framework == "tf" and model.endswith(".bin"):
-            model_kwargs["from_pt"] = True
-            logger.warning(
-                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
-                "Trying to load the model with Tensorflow."
-            )
-        model = model_class.from_pretrained(model, config=config, **model_kwargs)
-
-    return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)
diff --git a/server/transformers/src/transformers/tokenization_albert.py b/server/transformers/src/transformers/tokenization_albert.py
deleted file mode 100644
index 985f82c6fda167184f259482829c6d0949e10928..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_albert.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Tokenization classes for ALBERT model."""
-
-
-import logging
-import os
-import unicodedata
-from shutil import copyfile
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
-        "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
-        "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
-        "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
-        "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
-        "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
-        "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
-        "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "albert-base-v1": 512,
-    "albert-large-v1": 512,
-    "albert-xlarge-v1": 512,
-    "albert-xxlarge-v1": 512,
-    "albert-base-v2": 512,
-    "albert-large-v2": 512,
-    "albert-xlarge-v2": 512,
-    "albert-xxlarge-v2": 512,
-}
-
-SPIECE_UNDERLINE = "▁"
-
-
-class AlbertTokenizer(PreTrainedTokenizer):
-    """
-        SentencePiece based tokenizer. Peculiarities:
-
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        remove_space=True,
-        keep_accents=False,
-        bos_token="[CLS]",
-        eos_token="[SEP]",
-        unk_token="<unk>",
-        sep_token="[SEP]",
-        pad_token="<pad>",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs
-    ):
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(vocab_file)
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def preprocess_text(self, inputs):
-        if self.remove_space:
-            outputs = " ".join(inputs.strip().split())
-        else:
-            outputs = inputs
-        outputs = outputs.replace("``", '"').replace("''", '"')
-
-        if not self.keep_accents:
-            outputs = unicodedata.normalize("NFKD", outputs)
-            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
-        if self.do_lower_case:
-            outputs = outputs.lower()
-
-        return outputs
-
-    def _tokenize(self, text, sample=False):
-        """ Tokenize a string. """
-        text = self.preprocess_text(text)
-
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        new_pieces = []
-        for piece in pieces:
-            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
-                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
-                    if len(cur_pieces[0]) == 1:
-                        cur_pieces = cur_pieces[1:]
-                    else:
-                        cur_pieces[0] = cur_pieces[0][1:]
-                cur_pieces.append(piece[-1])
-                new_pieces.extend(cur_pieces)
-            else:
-                new_pieces.append(piece)
-
-        return new_pieces
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        An ALBERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        An ALBERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
diff --git a/server/transformers/src/transformers/tokenization_auto.py b/server/transformers/src/transformers/tokenization_auto.py
deleted file mode 100644
index d272b3367b29360f33e9074086f33a6cd9de56fd..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_auto.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Auto Model class. """
-
-
-import logging
-from collections import OrderedDict
-
-from .configuration_auto import (
-    AlbertConfig,
-    AutoConfig,
-    BertConfig,
-    CamembertConfig,
-    CTRLConfig,
-    DistilBertConfig,
-    FlaubertConfig,
-    GPT2Config,
-    OpenAIGPTConfig,
-    RobertaConfig,
-    T5Config,
-    TransfoXLConfig,
-    XLMConfig,
-    XLMRobertaConfig,
-    XLNetConfig,
-)
-from .configuration_utils import PretrainedConfig
-from .tokenization_albert import AlbertTokenizer
-from .tokenization_bert import BertTokenizer
-from .tokenization_bert_japanese import BertJapaneseTokenizer
-from .tokenization_camembert import CamembertTokenizer
-from .tokenization_ctrl import CTRLTokenizer
-from .tokenization_distilbert import DistilBertTokenizer
-from .tokenization_flaubert import FlaubertTokenizer
-from .tokenization_gpt2 import GPT2Tokenizer
-from .tokenization_openai import OpenAIGPTTokenizer
-from .tokenization_roberta import RobertaTokenizer
-from .tokenization_t5 import T5Tokenizer
-from .tokenization_transfo_xl import TransfoXLTokenizer
-from .tokenization_xlm import XLMTokenizer
-from .tokenization_xlm_roberta import XLMRobertaTokenizer
-from .tokenization_xlnet import XLNetTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-
-TOKENIZER_MAPPING = OrderedDict(
-    [
-        (T5Config, T5Tokenizer),
-        (DistilBertConfig, DistilBertTokenizer),
-        (AlbertConfig, AlbertTokenizer),
-        (CamembertConfig, CamembertTokenizer),
-        (XLMRobertaConfig, XLMRobertaTokenizer),
-        (RobertaConfig, RobertaTokenizer),
-        (BertConfig, BertTokenizer),
-        (OpenAIGPTConfig, OpenAIGPTTokenizer),
-        (GPT2Config, GPT2Tokenizer),
-        (TransfoXLConfig, TransfoXLTokenizer),
-        (XLNetConfig, XLNetTokenizer),
-        (FlaubertConfig, FlaubertTokenizer),
-        (XLMConfig, XLMTokenizer),
-        (CTRLConfig, CTRLTokenizer),
-    ]
-)
-
-
-class AutoTokenizer:
-    r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
-        that will be instantiated as one of the tokenizer classes of the library
-        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method take care of returning the correct tokenizer class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The tokenizer class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: T5Tokenizer (T5 model)
-            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
-            - contains `albert`: AlbertTokenizer (ALBERT model)
-            - contains `camembert`: CamembertTokenizer (CamemBERT model)
-            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
-            - contains `roberta`: RobertaTokenizer (RoBERTa model)
-            - contains `bert`: BertTokenizer (Bert model)
-            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
-            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
-            - contains `xlnet`: XLNetTokenizer (XLNet model)
-            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
-
-        This class cannot be instantiated using `__init__()` (throw an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoTokenizer is designed to be instantiated "
-            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
-        )
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        r""" Instantiate one of the tokenizer classes of the library
-        from a pre-trained model vocabulary.
-
-        The tokenizer class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: T5Tokenizer (T5 model)
-            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
-            - contains `albert`: AlbertTokenizer (ALBERT model)
-            - contains `camembert`: CamembertTokenizer (CamemBERT model)
-            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
-            - contains `roberta`: RobertaTokenizer (RoBERTa model)
-            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
-            - contains `bert`: BertTokenizer (Bert model)
-            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
-            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
-            - contains `xlnet`: XLNetTokenizer (XLNet model)
-            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
-
-        Params:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the vocabulary files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
-
-        Examples::
-
-            # Download vocabulary from S3 and cache.
-            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-
-            # Download vocabulary from S3 (user-uploaded) and cache.
-            tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
-
-            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
-            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
-
-        """
-        config = kwargs.pop("config", None)
-        if not isinstance(config, PretrainedConfig):
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        if "bert-base-japanese" in pretrained_model_name_or_path:
-            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        for config_class, tokenizer_class in TOKENIZER_MAPPING.items():
-            if isinstance(config, config_class):
-                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        raise ValueError(
-            "Unrecognized configuration class {} to build an AutoTokenizer.\n"
-            "Model type should be one of {}.".format(
-                config.__class__, ", ".join(c.__name__ for c in TOKENIZER_MAPPING.keys())
-            )
-        )
diff --git a/server/transformers/src/transformers/tokenization_bert.py b/server/transformers/src/transformers/tokenization_bert.py
deleted file mode 100644
index e3f8d00c6fefc352f2bbb106adb4e77a605bbac9..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_bert.py
+++ /dev/null
@@ -1,597 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes."""
-
-
-import collections
-import logging
-import os
-import unicodedata
-
-import tokenizers as tk
-
-from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-        "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-        "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
-        "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
-        "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
-        "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
-        "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
-        "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
-        "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
-        "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
-        "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
-        "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
-        "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-        "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
-        "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
-        "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
-        "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
-        "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "bert-base-uncased": 512,
-    "bert-large-uncased": 512,
-    "bert-base-cased": 512,
-    "bert-large-cased": 512,
-    "bert-base-multilingual-uncased": 512,
-    "bert-base-multilingual-cased": 512,
-    "bert-base-chinese": 512,
-    "bert-base-german-cased": 512,
-    "bert-large-uncased-whole-word-masking": 512,
-    "bert-large-cased-whole-word-masking": 512,
-    "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
-    "bert-large-cased-whole-word-masking-finetuned-squad": 512,
-    "bert-base-cased-finetuned-mrpc": 512,
-    "bert-base-german-dbmdz-cased": 512,
-    "bert-base-german-dbmdz-uncased": 512,
-    "bert-base-finnish-cased-v1": 512,
-    "bert-base-finnish-uncased-v1": 512,
-    "bert-base-dutch-cased": 512,
-}
-
-PRETRAINED_INIT_CONFIGURATION = {
-    "bert-base-uncased": {"do_lower_case": True},
-    "bert-large-uncased": {"do_lower_case": True},
-    "bert-base-cased": {"do_lower_case": False},
-    "bert-large-cased": {"do_lower_case": False},
-    "bert-base-multilingual-uncased": {"do_lower_case": True},
-    "bert-base-multilingual-cased": {"do_lower_case": False},
-    "bert-base-chinese": {"do_lower_case": False},
-    "bert-base-german-cased": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
-    "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
-    "bert-base-german-dbmdz-cased": {"do_lower_case": False},
-    "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
-    "bert-base-finnish-cased-v1": {"do_lower_case": False},
-    "bert-base-finnish-uncased-v1": {"do_lower_case": True},
-    "bert-base-dutch-cased": {"do_lower_case": False},
-}
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class BertTokenizer(PreTrainedTokenizer):
-    r"""
-    Constructs a BertTokenizer.
-    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
-
-    Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        **kwargs
-    ):
-        """Constructs a BertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-            **do_basic_tokenize**: (`optional`) boolean (default True)
-                Whether to do basic tokenization before wordpiece.
-            **never_split**: (`optional`) list of string
-                List of tokens which will never be split during tokenization.
-                Only has an effect when do_basic_tokenize=True
-            **tokenize_chinese_chars**: (`optional`) boolean (default True)
-                Whether to tokenize Chinese characters.
-                This should likely be deactivated for Japanese:
-                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
-        """
-        super().__init__(
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
-            )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def _tokenize(self, text):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                    split_tokens.append(sub_token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
-        index = 0
-        if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
-        else:
-            vocab_file = vocab_path
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-class BasicTokenizer(object):
-    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
-        """ Constructs a BasicTokenizer.
-
-        Args:
-            **do_lower_case**: Whether to lower case the input.
-            **never_split**: (`optional`) list of str
-                Kept for backward compatibility purposes.
-                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
-            **tokenize_chinese_chars**: (`optional`) boolean (default True)
-                Whether to tokenize Chinese characters.
-                This should likely be deactivated for Japanese:
-                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
-        """
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-
-    def tokenize(self, text, never_split=None):
-        """ Basic Tokenization of a piece of text.
-            Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            **never_split**: (`optional`) list of str
-                Kept for backward compatibility purposes.
-                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
-        """
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        text = self._clean_text(text)
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        orig_tokens = whitespace_tokenize(text)
-        split_tokens = []
-        for token in orig_tokens:
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-                token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if never_split is not None and text in never_split:
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)  #
-            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
-            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
-            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
-        ):  #
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer(object):
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text into its word pieces.
-
-        This uses a greedy longest-match-first algorithm to perform tokenization
-        using the given vocabulary.
-
-        For example:
-          input = "unaffable"
-          output = ["un", "##aff", "##able"]
-
-        Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
-
-        Returns:
-          A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
-
-
-class BertTokenizerFast(PreTrainedTokenizerFast):
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        max_length=None,
-        pad_to_max_length=False,
-        stride=0,
-        truncation_strategy="longest_first",
-        add_special_tokens=True,
-        **kwargs
-    ):
-        super().__init__(
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token))
-        self._update_special_tokens()
-        self._tokenizer.with_pre_tokenizer(
-            tk.pre_tokenizers.BertPreTokenizer.new(
-                do_basic_tokenize=do_basic_tokenize,
-                do_lower_case=do_lower_case,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                never_split=never_split if never_split is not None else [],
-            )
-        )
-        self._tokenizer.with_decoder(tk.decoders.WordPiece.new())
-
-        if add_special_tokens:
-            self._tokenizer.with_post_processor(
-                tk.processors.BertProcessing.new(
-                    (sep_token, self._tokenizer.token_to_id(sep_token)),
-                    (cls_token, self._tokenizer.token_to_id(cls_token)),
-                )
-            )
-        if max_length is not None:
-            self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
-        self._tokenizer.with_padding(
-            max_length=max_length if pad_to_max_length else None,
-            direction=self.padding_side,
-            pad_id=self.pad_token_id,
-            pad_type_id=self.pad_token_type_id,
-            pad_token=self.pad_token,
-        )
-        self._decoder = tk.decoders.WordPiece.new()
diff --git a/server/transformers/src/transformers/tokenization_bert_japanese.py b/server/transformers/src/transformers/tokenization_bert_japanese.py
deleted file mode 100644
index aaf82c54b3209b6c6d84202dd24ca257d49ba13f..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_bert_japanese.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes."""
-
-
-import collections
-import logging
-import os
-import unicodedata
-
-from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
-        "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
-        "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
-        "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "bert-base-japanese": 512,
-    "bert-base-japanese-whole-word-masking": 512,
-    "bert-base-japanese-char": 512,
-    "bert-base-japanese-char-whole-word-masking": 512,
-}
-
-PRETRAINED_INIT_CONFIGURATION = {
-    "bert-base-japanese": {
-        "do_lower_case": False,
-        "word_tokenizer_type": "mecab",
-        "subword_tokenizer_type": "wordpiece",
-    },
-    "bert-base-japanese-whole-word-masking": {
-        "do_lower_case": False,
-        "word_tokenizer_type": "mecab",
-        "subword_tokenizer_type": "wordpiece",
-    },
-    "bert-base-japanese-char": {
-        "do_lower_case": False,
-        "word_tokenizer_type": "mecab",
-        "subword_tokenizer_type": "character",
-    },
-    "bert-base-japanese-char-whole-word-masking": {
-        "do_lower_case": False,
-        "word_tokenizer_type": "mecab",
-        "subword_tokenizer_type": "character",
-    },
-}
-
-
-class BertJapaneseTokenizer(BertTokenizer):
-    """BERT tokenizer for Japanese text"""
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=False,
-        do_word_tokenize=True,
-        do_subword_tokenize=True,
-        word_tokenizer_type="basic",
-        subword_tokenizer_type="wordpiece",
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs
-    ):
-        """Constructs a MecabBertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
-                Only has an effect when do_basic_tokenize=True.
-            **do_word_tokenize**: (`optional`) boolean (default True)
-                Whether to do word tokenization.
-            **do_subword_tokenize**: (`optional`) boolean (default True)
-                Whether to do subword tokenization.
-            **word_tokenizer_type**: (`optional`) string (default "basic")
-                Type of word tokenizer.
-            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
-                Type of subword tokenizer.
-        """
-        super(BertTokenizer, self).__init__(
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-        # ^^ We call the grandparent's init, not the parent's.
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-
-        self.do_word_tokenize = do_word_tokenize
-        if do_word_tokenize:
-            if word_tokenizer_type == "basic":
-                self.word_tokenizer = BasicTokenizer(
-                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
-                )
-            elif word_tokenizer_type == "mecab":
-                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split)
-            else:
-                raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
-
-        self.do_subword_tokenize = do_subword_tokenize
-        if do_subword_tokenize:
-            if subword_tokenizer_type == "wordpiece":
-                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
-            elif subword_tokenizer_type == "character":
-                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
-            else:
-                raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
-
-    def _tokenize(self, text):
-        if self.do_word_tokenize:
-            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
-        else:
-            tokens = [text]
-
-        if self.do_subword_tokenize:
-            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
-        else:
-            split_tokens = tokens
-
-        return split_tokens
-
-
-class MecabTokenizer(object):
-    """Runs basic tokenization with MeCab morphological parser."""
-
-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
-        """Constructs a MecabTokenizer.
-
-        Args:
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
-            **never_split**: (`optional`) list of str
-                Kept for backward compatibility purposes.
-                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
-            **normalize_text**: (`optional`) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-        """
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-
-        import MeCab
-
-        self.mecab = MeCab.Tagger()
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        mecab_output = self.mecab.parse(text)
-
-        cursor = 0
-        for line in mecab_output.split("\n"):
-            if line == "EOS":
-                break
-
-            token, _ = line.split("\t")
-            token_start = text.index(token, cursor)
-            token_end = token_start + len(token)
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            tokens.append(token)
-            cursor = token_end
-
-        return tokens
-
-
-class CharacterTokenizer(object):
-    """Runs Character tokenziation."""
-
-    def __init__(self, vocab, unk_token, normalize_text=True):
-        """Constructs a CharacterTokenizer.
-
-        Args:
-            **vocab**:
-                Vocabulary object.
-            **unk_token**: str
-                A special symbol for out-of-vocabulary token.
-            **normalize_text**: (`optional`) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-        """
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.normalize_text = normalize_text
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text into characters.
-
-        For example:
-            input = "apple"
-            output = ["a", "p", "p", "l", "e"]
-        Args:
-            text: A single token or whitespace separated tokens.
-                This should have already been passed through `BasicTokenizer`.
-        Returns:
-            A list of characters.
-        """
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        output_tokens = []
-        for i, char in enumerate(text):
-            if char not in self.vocab:
-                output_tokens.append(self.unk_token)
-                continue
-
-            output_tokens.append(char)
-
-        return output_tokens
diff --git a/server/transformers/src/transformers/tokenization_camembert.py b/server/transformers/src/transformers/tokenization_camembert.py
deleted file mode 100644
index a158419470fb31d37b0e2e14a87bb9b219365640..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_camembert.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-""" Tokenization classes for Camembert model."""
-
-
-import logging
-import os
-from shutil import copyfile
-
-import sentencepiece as spm
-
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from .tokenization_xlnet import SPIECE_UNDERLINE
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "camembert-base": None,
-}
-
-SHARED_MODEL_IDENTIFIERS = [
-    # Load with
-    # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
-    "Musixmatch/umberto-commoncrawl-cased-v1",
-    "Musixmatch/umberto-wikipedia-uncased-v1",
-]
-
-
-class CamembertTokenizer(PreTrainedTokenizer):
-    """
-        Adapted from RobertaTokenizer and XLNetTokenizer
-        SentencePiece based tokenizer. Peculiarities:
-
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        eos_token="</s>",
-        sep_token="</s>",
-        cls_token="<s>",
-        unk_token="<unk>",
-        pad_token="<pad>",
-        mask_token="<mask>",
-        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
-        **kwargs
-    ):
-        super().__init__(
-            max_len=512,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
-        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
-        # sentencepiece vocabulary (this is the case for <s> and </s>
-        self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
-        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
-        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A RoBERTa sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    @property
-    def vocab_size(self):
-        return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
-
-    def _tokenize(self, text):
-        return self.sp_model.EncodeAsPieces(text)
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        elif self.sp_model.PieceToId(token) == 0:
-            # Convert sentence piece unk token to fairseq unk token index
-            return self.unk_token_id
-        return self.fairseq_offset + self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
diff --git a/server/transformers/src/transformers/tokenization_ctrl.py b/server/transformers/src/transformers/tokenization_ctrl.py
deleted file mode 100644
index 1f2184f0a12e31f7a5a575758781b47b5294cfd0..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_ctrl.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for Salesforce CTRL."""
-
-
-import json
-import logging
-import os
-
-import regex as re
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
-    "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "ctrl": 256,
-}
-
-CONTROL_CODES = {
-    "Pregnancy": 168629,
-    "Christianity": 7675,
-    "Explain": 106423,
-    "Fitness": 63440,
-    "Saving": 63163,
-    "Ask": 27171,
-    "Ass": 95985,
-    "Joke": 163509,
-    "Questions": 45622,
-    "Thoughts": 49605,
-    "Retail": 52342,
-    "Feminism": 164338,
-    "Writing": 11992,
-    "Atheism": 192263,
-    "Netflix": 48616,
-    "Computing": 39639,
-    "Opinion": 43213,
-    "Alone": 44967,
-    "Funny": 58917,
-    "Gaming": 40358,
-    "Human": 4088,
-    "India": 1331,
-    "Joker": 77138,
-    "Diet": 36206,
-    "Legal": 11859,
-    "Norman": 4939,
-    "Tip": 72689,
-    "Weight": 52343,
-    "Movies": 46273,
-    "Running": 23425,
-    "Science": 2090,
-    "Horror": 37793,
-    "Confession": 60572,
-    "Finance": 12250,
-    "Politics": 16360,
-    "Scary": 191985,
-    "Support": 12654,
-    "Technologies": 32516,
-    "Teenage": 66160,
-    "Event": 32769,
-    "Learned": 67460,
-    "Notion": 182770,
-    "Wikipedia": 37583,
-    "Books": 6665,
-    "Extract": 76050,
-    "Confessions": 102701,
-    "Conspiracy": 75932,
-    "Links": 63674,
-    "Narcissus": 150425,
-    "Relationship": 54766,
-    "Relationships": 134796,
-    "Reviews": 41671,
-    "News": 4256,
-    "Translation": 26820,
-    "multilingual": 128406,
-}
-
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-
-    pairs = set(pairs)
-    return pairs
-
-
-class CTRLTokenizer(PreTrainedTokenizer):
-    """
-    CTRL BPE tokenizer. Peculiarities:
-        - Byte-Pair-Encoding
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    control_codes = CONTROL_CODES
-
-    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
-        super().__init__(unk_token=unk_token, **kwargs)
-        self.max_len_single_sentence = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[1:-1]
-        merges = [tuple(merge.split()) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = "@@ ".join(word)
-        word = word[:-4]
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """ Tokenize a string.
-        """
-        split_tokens = []
-
-        words = re.findall(r"\S+\n?", text)
-
-        for token in words:
-            split_tokens.extend([t for t in self.bpe(token).split(" ")])
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = " ".join(tokens).replace("@@ ", "").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-    #     filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
-    #     tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
-    #     tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
-    #     return ''.join(tokens_generated_so_far)
diff --git a/server/transformers/src/transformers/tokenization_distilbert.py b/server/transformers/src/transformers/tokenization_distilbert.py
deleted file mode 100644
index 82dbfdb414f63cc1fc5606c188298e387ef37376..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_distilbert.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for DistilBERT."""
-
-
-import logging
-
-from .tokenization_bert import BertTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-        "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-        "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
-        "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "distilbert-base-uncased": 512,
-    "distilbert-base-uncased-distilled-squad": 512,
-    "distilbert-base-german-cased": 512,
-    "distilbert-base-multilingual-cased": 512,
-}
-
-
-PRETRAINED_INIT_CONFIGURATION = {
-    "distilbert-base-uncased": {"do_lower_case": True},
-    "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
-    "distilbert-base-german-cased": {"do_lower_case": False},
-    "distilbert-base-multilingual-cased": {"do_lower_case": False},
-}
-
-
-class DistilBertTokenizer(BertTokenizer):
-    r"""
-    Constructs a DistilBertTokenizer.
-    :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
-
-    Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
diff --git a/server/transformers/src/transformers/tokenization_flaubert.py b/server/transformers/src/transformers/tokenization_flaubert.py
deleted file mode 100644
index e648a61c94f4d6aa3a8ffca9de25b4854edcdbc2..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_flaubert.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for Flaubert, based on XLM."""
-
-
-import logging
-import unicodedata
-
-import six
-
-from .tokenization_xlm import XLMTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/vocab.json",
-        "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/vocab.json",
-        "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/vocab.json",
-        "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/vocab.json",
-    },
-    "merges_file": {
-        "flaubert-small-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_small_cased/merges.txt",
-        "flaubert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_uncased/merges.txt",
-        "flaubert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_base_cased/merges.txt",
-        "flaubert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/flaubert/flaubert_large_cased/merges.txt",
-    },
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "flaubert-small-cased": 512,
-    "flaubert-base-uncased": 512,
-    "flaubert-base-cased": 512,
-    "flaubert-large-cased": 512,
-}
-
-PRETRAINED_INIT_CONFIGURATION = {
-    "flaubert-small-cased": {"do_lowercase": False},
-    "flaubert-base-uncased": {"do_lowercase": True},
-    "flaubert-base-cased": {"do_lowercase": False},
-    "flaubert-large-cased": {"do_lowercase": False},
-}
-
-
-def convert_to_unicode(text):
-    """
-    Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
-    """
-    # six_ensure_text is copied from https://github.com/benjaminp/six
-    def six_ensure_text(s, encoding="utf-8", errors="strict"):
-        if isinstance(s, six.binary_type):
-            return s.decode(encoding, errors)
-        elif isinstance(s, six.text_type):
-            return s
-        else:
-            raise TypeError("not expecting type '%s'" % type(s))
-
-    return six_ensure_text(text, encoding="utf-8", errors="ignore")
-
-
-class FlaubertTokenizer(XLMTokenizer):
-    """
-    BPE tokenizer for Flaubert
-
-        - Moses preprocessing & tokenization
-
-        - Normalize all inputs text
-
-        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
-        (ex: "__classify__") to a vocabulary
-
-        - `do_lowercase` controle lower casing (automatically set for pretrained vocabularies)
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(self, do_lowercase=False, **kwargs):
-        super().__init__(**kwargs)
-        self.do_lowercase = do_lowercase
-        self.do_lowercase_and_remove_accent = False
-
-    def preprocess_text(self, text):
-        text = text.replace("``", '"').replace("''", '"')
-        text = convert_to_unicode(text)
-        text = unicodedata.normalize("NFC", text)
-
-        if self.do_lowercase:
-            text = text.lower()
-
-        return text
-
-    def _tokenize(self, text, bypass_tokenizer=False):
-        """
-        Tokenize a string given language code using Moses.
-
-        Details of tokenization:
-        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
-            - Install with `pip install sacremoses`
-
-        Args:
-            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)  (bool). If True, we only apply BPE.
-
-        Returns:
-            List of tokens.
-        """
-        lang = "fr"
-        if lang and self.lang2id and lang not in self.lang2id:
-            logger.error(
-                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
-            )
-
-        if bypass_tokenizer:
-            text = text.split()
-        else:
-            text = self.preprocess_text(text)
-            text = self.moses_pipeline(text, lang=lang)
-            text = self.moses_tokenize(text, lang=lang)
-
-        split_tokens = []
-        for token in text:
-            if token:
-                split_tokens.extend([t for t in self.bpe(token).split(" ")])
-
-        return split_tokens
diff --git a/server/transformers/src/transformers/tokenization_gpt2.py b/server/transformers/src/transformers/tokenization_gpt2.py
deleted file mode 100644
index 4f2de845b569bc8f38880fab521607610e4024d8..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_gpt2.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-
-import json
-import logging
-import os
-from functools import lru_cache
-
-import regex as re
-import tokenizers as tk
-
-from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
-        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
-        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
-        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
-    },
-    "merges_file": {
-        "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-        "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
-        "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
-        "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
-        "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
-    },
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "gpt2": 1024,
-    "gpt2-medium": 1024,
-    "gpt2-large": 1024,
-    "gpt2-xl": 1024,
-    "distilgpt2": 1024,
-}
-
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings.
-    We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2 ** 8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2 ** 8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class GPT2Tokenizer(PreTrainedTokenizer):
-    """
-    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
-          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        **kwargs
-    ):
-        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
-        self.max_len_single_sentence = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text, add_prefix_space=False):
-        """ Tokenize a string.
-            Args:
-                - add_prefix_space (boolean, default False):
-                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
-        """
-        if add_prefix_space:
-            text = " " + text
-
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-
-class GPT2TokenizerFast(PreTrainedTokenizerFast):
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        pad_to_max_length=False,
-        add_prefix_space=False,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        **kwargs
-    ):
-        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
-
-        self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file))
-        self._update_special_tokens()
-        self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space))
-        self._tokenizer.with_decoder(tk.decoders.ByteLevel.new())
-        if max_length:
-            self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
-        self._tokenizer.with_padding(
-            max_length=max_length if pad_to_max_length else None,
-            direction=self.padding_side,
-            pad_id=self.pad_token_id if self.pad_token_id is not None else 0,
-            pad_type_id=self.pad_token_type_id,
-            pad_token=self.pad_token if self.pad_token is not None else "",
-        )
-        self._decoder = tk.decoders.ByteLevel.new()
diff --git a/server/transformers/src/transformers/tokenization_openai.py b/server/transformers/src/transformers/tokenization_openai.py
deleted file mode 100644
index eca9f81c3ef631d6f27f34965eadc5c793c928e1..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_openai.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-
-import json
-import logging
-import os
-import re
-
-from .tokenization_bert import BasicTokenizer
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"},
-    "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"},
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "openai-gpt": 512,
-}
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-    word is represented as tuple of symbols (symbols being variable-length strings)
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-def text_standardize(text):
-    """
-    fixes some issues the spacy tokenizer had on books corpus
-    also does some whitespace standardization
-    """
-    text = text.replace("—", "-")
-    text = text.replace("–", "-")
-    text = text.replace("―", "-")
-    text = text.replace("…", "...")
-    text = text.replace("´", "'")
-    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
-    text = re.sub(r"\s*\n\s*", " \n ", text)
-    text = re.sub(r"[^\S\n]+", " ", text)
-    return text.strip()
-
-
-class OpenAIGPTTokenizer(PreTrainedTokenizer):
-    """
-    BPE tokenizer. Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
-        super().__init__(unk_token=unk_token, **kwargs)
-
-        self.max_len_single_sentence = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-
-        try:
-            import ftfy
-            from spacy.lang.en import English
-
-            _nlp = English()
-            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
-            self.fix_text = ftfy.fix_text
-        except ImportError:
-            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True)
-            self.fix_text = None
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[1:-1]
-        merges = [tuple(merge.split()) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + "</w>",)
-        if token in self.cache:
-            return self.cache[token]
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token + "</w>"
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        if word == "\n  </w>":
-            word = "\n</w>"
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """ Tokenize a string. """
-        split_tokens = []
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer
-            text = self.nlp.tokenize(text)
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token).split(" ")])
-        else:
-            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
-            text = self.nlp(text_standardize(self.fix_text(text)))
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")])
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an id in a token (BPE) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = "".join(tokens).replace("</w>", " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
diff --git a/server/transformers/src/transformers/tokenization_roberta.py b/server/transformers/src/transformers/tokenization_roberta.py
deleted file mode 100644
index caaaf98cd0dbd90f8b944328a96403be5e3ebb6e..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_roberta.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for RoBERTa."""
-
-
-import logging
-
-from .tokenization_gpt2 import GPT2Tokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
-        "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
-        "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
-        "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
-        "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
-        "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
-    },
-    "merges_file": {
-        "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
-        "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
-        "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
-        "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
-        "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
-        "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
-    },
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "roberta-base": 512,
-    "roberta-large": 512,
-    "roberta-large-mnli": 512,
-    "distilroberta-base": 512,
-    "roberta-base-openai-detector": 512,
-    "roberta-large-openai-detector": 512,
-}
-
-
-class RobertaTokenizer(GPT2Tokenizer):
-    """
-    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        bos_token="<s>",
-        eos_token="</s>",
-        sep_token="</s>",
-        cls_token="<s>",
-        unk_token="<unk>",
-        pad_token="<pad>",
-        mask_token="<mask>",
-        **kwargs
-    ):
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
diff --git a/server/transformers/src/transformers/tokenization_t5.py b/server/transformers/src/transformers/tokenization_t5.py
deleted file mode 100644
index 2196cc82e726effbf8d8339626efd9ac38c6faf7..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_t5.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# coding=utf-8
-# Copyright 2018 T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Tokenization class for model T5."""
-
-
-import logging
-import os
-import re
-from shutil import copyfile
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-SPIECE_UNDERLINE = "▁"
-
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to file names for serializing Tokenizer instances
-####################################################
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
-
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to pretrained vocabulary URL for all the model shortcut names.
-####################################################
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-    }
-}
-
-####################################################
-# Mapping from model shortcut names to max length of inputs
-####################################################
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "t5-small": 512,
-    "t5-base": 512,
-    "t5-large": 512,
-    "t5-3b": 512,
-    "t5-11b": 512,
-}
-
-
-class T5Tokenizer(PreTrainedTokenizer):
-    """
-        SentencePiece based tokenizer. Peculiarities:
-
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
-            - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels.
-                These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
-                Extra tokens are indexed from the end of the vocabulary up to beginnning (<extra_id_0> is the last token in the vocabulary)
-                (like in T5 preprocessing
-                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        eos_token="</s>",
-        unk_token="<unk>",
-        pad_token="<pad>",
-        extra_ids=100,
-        additional_special_tokens=None,
-        **kwargs
-    ):
-        # Add extra_ids to the special token list
-        if extra_ids > 0:
-            if additional_special_tokens is None:
-                additional_special_tokens = []
-            additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])
-
-        super().__init__(
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use T5Tokenizer:"
-                "https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-
-        self.vocab_file = vocab_file
-        self._extra_ids = extra_ids
-
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(vocab_file)
-
-    @property
-    def vocab_size(self):
-        return self.sp_model.get_piece_size() + self._extra_ids
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def _tokenize(self, text, sample=False):
-        """ Take as input a string and return a list of strings (tokens) for words/sub-words
-        """
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        return pieces
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        if token.startswith("<extra_id_"):
-            match = re.match(r"<extra_id_(\d+)>", token)
-            num = int(match.group(1))
-            return self.vocab_size - num - 1
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index < self.sp_model.get_piece_size():
-            token = self.sp_model.IdToPiece(index)
-        else:
-            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = self.sp_model.decode_pieces(tokens)
-        return out_string
-
-    def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
diff --git a/server/transformers/src/transformers/tokenization_transfo_xl.py b/server/transformers/src/transformers/tokenization_transfo_xl.py
deleted file mode 100644
index 9d847e6f8ca491219d5b96b8a1ec38cdb819bf79..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_transfo_xl.py
+++ /dev/null
@@ -1,581 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Tokenization classes for Transformer XL model.
-    Adapted from https://github.com/kimiyoung/transformer-xl.
-"""
-
-
-import glob
-import logging
-import os
-import pickle
-from collections import Counter, OrderedDict
-
-import numpy as np
-
-from .file_utils import cached_path, is_torch_available
-from .tokenization_utils import PreTrainedTokenizer
-
-
-if is_torch_available():
-    import torch
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "pretrained_vocab_file": {
-        "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "transfo-xl-wt103": None,
-}
-
-PRETRAINED_CORPUS_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
-}
-CORPUS_NAME = "corpus.bin"
-
-
-class TransfoXLTokenizer(PreTrainedTokenizer):
-    """
-    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        special=None,
-        min_freq=0,
-        max_size=None,
-        lower_case=False,
-        delimiter=None,
-        vocab_file=None,
-        pretrained_vocab_file=None,
-        never_split=None,
-        unk_token="<unk>",
-        eos_token="<eos>",
-        additional_special_tokens=["<formula>"],
-        **kwargs
-    ):
-        super().__init__(
-            unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
-        )
-
-        self.max_len_single_sentence = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-        self.max_len_sentences_pair = (
-            self.max_len
-        )  # no default special tokens - you can update this value if you add special tokens
-
-        if never_split is None:
-            never_split = self.all_special_tokens
-        if special is None:
-            special = []
-        self.counter = Counter()
-        self.special = special
-        self.min_freq = min_freq
-        self.max_size = max_size
-        self.lower_case = lower_case
-        self.delimiter = delimiter
-        self.vocab_file = vocab_file
-        self.never_split = never_split
-
-        if pretrained_vocab_file is not None:
-            # Hack because, honestly this tokenizer was not made to be used
-            # in a library like ours, at all.
-            vocab_dict = torch.load(pretrained_vocab_file)
-            for key, value in vocab_dict.items():
-                if key not in self.__dict__:
-                    self.__dict__[key] = value
-
-        if vocab_file is not None:
-            self.build_vocab()
-
-    def count_file(self, path, verbose=False, add_eos=False):
-        if verbose:
-            logger.info("counting file {} ...".format(path))
-        assert os.path.exists(path)
-
-        sents = []
-        with open(path, "r", encoding="utf-8") as f:
-            for idx, line in enumerate(f):
-                if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info("    line {}".format(idx))
-                symbols = self.tokenize(line, add_eos=add_eos)
-                self.counter.update(symbols)
-                sents.append(symbols)
-
-        return sents
-
-    def count_sents(self, sents, verbose=False):
-        """
-            sents : a list of sentences, each a list of tokenized symbols
-        """
-        if verbose:
-            logger.info("counting {} sents ...".format(len(sents)))
-        for idx, symbols in enumerate(sents):
-            if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info("    line {}".format(idx))
-            self.counter.update(symbols)
-
-    def _build_from_file(self, vocab_file):
-        self.idx2sym = []
-        self.sym2idx = OrderedDict()
-
-        with open(vocab_file, "r", encoding="utf-8") as f:
-            for line in f:
-                symb = line.strip().split()[0]
-                self.add_symbol(symb)
-        if "<UNK>" in self.sym2idx:
-            self.unk_idx = self.sym2idx["<UNK>"]
-        elif "<unk>" in self.sym2idx:
-            self.unk_idx = self.sym2idx["<unk>"]
-        else:
-            raise ValueError("No <unkown> token in vocabulary")
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
-        if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
-        torch.save(self.__dict__, vocab_file)
-        return (vocab_file,)
-
-    def build_vocab(self):
-        if self.vocab_file:
-            logger.info("building vocab from {}".format(self.vocab_file))
-            self._build_from_file(self.vocab_file)
-            logger.info("final vocab size {}".format(len(self)))
-        else:
-            logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
-            self.idx2sym = []
-            self.sym2idx = OrderedDict()
-
-            for sym in self.special:
-                self.add_special(sym)
-
-            for sym, cnt in self.counter.most_common(self.max_size):
-                if cnt < self.min_freq:
-                    break
-                self.add_symbol(sym)
-
-            logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))
-
-    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
-        if verbose:
-            logger.info("encoding file {} ...".format(path))
-        assert os.path.exists(path)
-        encoded = []
-        with open(path, "r", encoding="utf-8") as f:
-            for idx, line in enumerate(f):
-                if verbose and idx > 0 and idx % 500000 == 0:
-                    logger.info("    line {}".format(idx))
-                symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
-                encoded.append(self.convert_to_tensor(symbols))
-
-        if ordered:
-            encoded = torch.cat(encoded)
-
-        return encoded
-
-    def encode_sents(self, sents, ordered=False, verbose=False):
-        if verbose:
-            logger.info("encoding {} sents ...".format(len(sents)))
-        encoded = []
-        for idx, symbols in enumerate(sents):
-            if verbose and idx > 0 and idx % 500000 == 0:
-                logger.info("    line {}".format(idx))
-            encoded.append(self.convert_to_tensor(symbols))
-
-        if ordered:
-            encoded = torch.cat(encoded)
-
-        return encoded
-
-    def add_special(self, sym):
-        if sym not in self.sym2idx:
-            self.idx2sym.append(sym)
-            self.sym2idx[sym] = len(self.idx2sym) - 1
-            setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])
-
-    def add_symbol(self, sym):
-        if sym not in self.sym2idx:
-            self.idx2sym.append(sym)
-            self.sym2idx[sym] = len(self.idx2sym) - 1
-
-    def _convert_id_to_token(self, idx):
-        """Converts an id in a token (BPE) using the vocab."""
-        assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
-        return self.idx2sym[idx]
-
-    def _convert_token_to_id(self, sym):
-        """ Converts a token (str) in an id using the vocab. """
-        if sym in self.sym2idx:
-            return self.sym2idx[sym]
-        else:
-            # logger.info('encounter unk {}'.format(sym))
-            # assert '<eos>' not in sym
-            if hasattr(self, "unk_idx"):
-                return self.sym2idx.get(sym, self.unk_idx)
-            # Backward compatibility with pre-trained models
-            elif "<unk>" in self.sym2idx:
-                return self.sym2idx["<unk>"]
-            elif "<UNK>" in self.sym2idx:
-                return self.sym2idx["<UNK>"]
-            else:
-                raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = " ".join(tokens).strip()
-        return out_string
-
-    def convert_to_tensor(self, symbols):
-        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
-
-    @property
-    def vocab_size(self):
-        return len(self.idx2sym)
-
-    def _tokenize(self, line, add_eos=False, add_double_eos=False):
-        line = line.strip()
-        # convert to lower case
-        if self.lower_case:
-            line = line.lower()
-
-        # empty delimiter '' will evaluate False
-        if self.delimiter == "":
-            symbols = line
-        else:
-            symbols = line.split(self.delimiter)
-
-        if add_double_eos:  # lm1b
-            return ["<S>"] + symbols + ["<S>"]
-        elif add_eos:
-            return symbols + ["<eos>"]
-        else:
-            return symbols
-
-
-class LMOrderedIterator(object):
-    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
-        """
-            data -- LongTensor -- the LongTensor is strictly ordered
-        """
-        self.bsz = bsz
-        self.bptt = bptt
-        self.ext_len = ext_len if ext_len is not None else 0
-
-        self.device = device
-
-        # Work out how cleanly we can divide the dataset into bsz parts.
-        self.n_step = data.size(0) // bsz
-
-        # Trim off any extra elements that wouldn't cleanly fit (remainders).
-        data = data.narrow(0, 0, self.n_step * bsz)
-
-        # Evenly divide the data across the bsz batches.
-        self.data = data.view(bsz, -1).t().contiguous().to(device)
-
-        # Number of mini-batches
-        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
-
-    def get_batch(self, i, bptt=None):
-        if bptt is None:
-            bptt = self.bptt
-        seq_len = min(bptt, self.data.size(0) - 1 - i)
-
-        end_idx = i + seq_len
-        beg_idx = max(0, i - self.ext_len)
-
-        data = self.data[beg_idx:end_idx]
-        target = self.data[i + 1 : i + 1 + seq_len]
-
-        data_out = data.transpose(0, 1).contiguous().to(self.device)
-        target_out = target.transpose(0, 1).contiguous().to(self.device)
-
-        return data_out, target_out, seq_len
-
-    def get_fixlen_iter(self, start=0):
-        for i in range(start, self.data.size(0) - 1, self.bptt):
-            yield self.get_batch(i)
-
-    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
-        max_len = self.bptt + max_deviation * std
-        i = start
-        while True:
-            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
-            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
-            data, target, seq_len = self.get_batch(i, bptt)
-            i += seq_len
-            yield data, target, seq_len
-            if i >= self.data.size(0) - 2:
-                break
-
-    def __iter__(self):
-        return self.get_fixlen_iter()
-
-
-class LMShuffledIterator(object):
-    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
-        """
-            data -- list[LongTensor] -- there is no order among the LongTensors
-        """
-        self.data = data
-
-        self.bsz = bsz
-        self.bptt = bptt
-        self.ext_len = ext_len if ext_len is not None else 0
-
-        self.device = device
-        self.shuffle = shuffle
-
-    def get_sent_stream(self):
-        # index iterator
-        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
-
-        # sentence iterator
-        for idx in epoch_indices:
-            yield self.data[idx]
-
-    def stream_iterator(self, sent_stream):
-        # streams for each data in the batch
-        streams = [None] * self.bsz
-
-        data = torch.LongTensor(self.bptt, self.bsz)
-        target = torch.LongTensor(self.bptt, self.bsz)
-
-        n_retain = 0
-
-        while True:
-            # data   : [n_retain+bptt x bsz]
-            # target : [bptt x bsz]
-            data[n_retain:].fill_(-1)
-            target.fill_(-1)
-
-            valid_batch = True
-
-            for i in range(self.bsz):
-                n_filled = 0
-                try:
-                    while n_filled < self.bptt:
-                        if streams[i] is None or len(streams[i]) <= 1:
-                            streams[i] = next(sent_stream)
-                        # number of new tokens to fill in
-                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
-                        # first n_retain tokens are retained from last batch
-                        data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
-                        target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
-                        streams[i] = streams[i][n_new:]
-                        n_filled += n_new
-                except StopIteration:
-                    valid_batch = False
-                    break
-
-            if not valid_batch:
-                return
-
-            data_out = data.transpose(0, 1).contiguous().to(self.device)
-            target_out = target.transpose(0, 1).contiguous().to(self.device)
-
-            yield data_out, target_out, self.bptt
-
-            n_retain = min(data.size(0), self.ext_len)
-            if n_retain > 0:
-                data[:n_retain] = data[-n_retain:]
-            data.resize_(n_retain + self.bptt, data.size(1))
-
-    def __iter__(self):
-        # sent_stream is an iterator
-        sent_stream = self.get_sent_stream()
-
-        for batch in self.stream_iterator(sent_stream):
-            yield batch
-
-
-class LMMultiFileIterator(LMShuffledIterator):
-    def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
-
-        self.paths = paths
-        self.vocab = vocab
-
-        self.bsz = bsz
-        self.bptt = bptt
-        self.ext_len = ext_len if ext_len is not None else 0
-
-        self.device = device
-        self.shuffle = shuffle
-
-    def get_sent_stream(self, path):
-        sents = self.vocab.encode_file(path, add_double_eos=True)
-        if self.shuffle:
-            np.random.shuffle(sents)
-        sent_stream = iter(sents)
-
-        return sent_stream
-
-    def __iter__(self):
-        if self.shuffle:
-            np.random.shuffle(self.paths)
-
-        for path in self.paths:
-            # sent_stream is an iterator
-            sent_stream = self.get_sent_stream(path)
-            for batch in self.stream_iterator(sent_stream):
-                yield batch
-
-
-class TransfoXLCorpus(object):
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a pre-processed corpus.
-        """
-        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
-            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            logger.error(
-                "Corpus '{}' was not found in corpus list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} "
-                "at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
-                    pretrained_model_name_or_path,
-                    corpus_file,
-                )
-            )
-            return None
-        if resolved_corpus_file == corpus_file:
-            logger.info("loading corpus file {}".format(corpus_file))
-        else:
-            logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file))
-
-        # Instantiate tokenizer.
-        corpus = cls(*inputs, **kwargs)
-        corpus_dict = torch.load(resolved_corpus_file)
-        for key, value in corpus_dict.items():
-            corpus.__dict__[key] = value
-        corpus.vocab = vocab
-        if corpus.train is not None:
-            corpus.train = torch.tensor(corpus.train, dtype=torch.long)
-        if corpus.valid is not None:
-            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
-        if corpus.test is not None:
-            corpus.test = torch.tensor(corpus.test, dtype=torch.long)
-        return corpus
-
-    def __init__(self, *args, **kwargs):
-        self.vocab = TransfoXLTokenizer(*args, **kwargs)
-        self.dataset = None
-        self.train = None
-        self.valid = None
-        self.test = None
-
-    def build_corpus(self, path, dataset):
-        self.dataset = dataset
-
-        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
-            self.vocab.count_file(os.path.join(path, "train.txt"))
-            self.vocab.count_file(os.path.join(path, "valid.txt"))
-            self.vocab.count_file(os.path.join(path, "test.txt"))
-        elif self.dataset == "wt103":
-            self.vocab.count_file(os.path.join(path, "train.txt"))
-        elif self.dataset == "lm1b":
-            train_path_pattern = os.path.join(
-                path,
-                "1-billion-word-language-modeling-benchmark-r13output",
-                "training-monolingual.tokenized.shuffled",
-                "news.en-*",
-            )
-            train_paths = glob.glob(train_path_pattern)
-            # the vocab will load from file when build_vocab() is called
-
-        self.vocab.build_vocab()
-
-        if self.dataset in ["ptb", "wt2", "wt103"]:
-            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
-            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
-            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
-        elif self.dataset in ["enwik8", "text8"]:
-            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
-            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
-            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
-        elif self.dataset == "lm1b":
-            self.train = train_paths
-            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
-            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
-
-    def get_iterator(self, split, *args, **kwargs):
-        if split == "train":
-            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
-                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
-            elif self.dataset == "lm1b":
-                kwargs["shuffle"] = True
-                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
-        elif split in ["valid", "test"]:
-            data = self.valid if split == "valid" else self.test
-            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
-                data_iter = LMOrderedIterator(data, *args, **kwargs)
-            elif self.dataset == "lm1b":
-                data_iter = LMShuffledIterator(data, *args, **kwargs)
-
-        return data_iter
-
-
-def get_lm_corpus(datadir, dataset):
-    fn = os.path.join(datadir, "cache.pt")
-    fn_pickle = os.path.join(datadir, "cache.pkl")
-    if os.path.exists(fn):
-        logger.info("Loading cached dataset...")
-        corpus = torch.load(fn_pickle)
-    elif os.path.exists(fn):
-        logger.info("Loading cached dataset from pickle...")
-        with open(fn, "rb") as fp:
-            corpus = pickle.load(fp)
-    else:
-        logger.info("Producing dataset {}...".format(dataset))
-        kwargs = {}
-        if dataset in ["wt103", "wt2"]:
-            kwargs["special"] = ["<eos>"]
-            kwargs["lower_case"] = False
-        elif dataset == "ptb":
-            kwargs["special"] = ["<eos>"]
-            kwargs["lower_case"] = True
-        elif dataset == "lm1b":
-            kwargs["special"] = []
-            kwargs["lower_case"] = False
-            kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
-        elif dataset in ["enwik8", "text8"]:
-            pass
-
-        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
-        torch.save(corpus, fn)
-
-    return corpus
diff --git a/server/transformers/src/transformers/tokenization_utils.py b/server/transformers/src/transformers/tokenization_utils.py
deleted file mode 100644
index 469181325aaa9ab582ba462a381b93e7761bdd7a..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_utils.py
+++ /dev/null
@@ -1,1615 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-
-import copy
-import itertools
-import json
-import logging
-import os
-import re
-
-from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
-
-
-if is_tf_available():
-    import tensorflow as tf
-if is_torch_available():
-    import torch
-
-logger = logging.getLogger(__name__)
-
-SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
-ADDED_TOKENS_FILE = "added_tokens.json"
-TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
-
-
-class PreTrainedTokenizer(object):
-    """ Base class for all tokenizers.
-    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary.
-
-    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
-
-    Class attributes (overridden by derived classes):
-
-        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
-        - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
-        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
-        - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
-
-    Parameters:
-
-        - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id``
-
-        - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id``
-
-        - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id``
-
-        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id``
-
-        - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id``
-
-        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id``
-
-        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
-
-        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
-    """
-
-    vocab_files_names = {}
-    pretrained_vocab_files_map = {}
-    pretrained_init_configuration = {}
-    max_model_input_sizes = {}
-
-    SPECIAL_TOKENS_ATTRIBUTES = [
-        "bos_token",
-        "eos_token",
-        "unk_token",
-        "sep_token",
-        "pad_token",
-        "cls_token",
-        "mask_token",
-        "additional_special_tokens",
-    ]
-
-    padding_side = "right"
-
-    @property
-    def bos_token(self):
-        """ Beginning of sentence token (string). Log an error if used while not having been set. """
-        if self._bos_token is None:
-            logger.error("Using bos_token, but it is not set yet.")
-        return self._bos_token
-
-    @property
-    def eos_token(self):
-        """ End of sentence token (string). Log an error if used while not having been set. """
-        if self._eos_token is None:
-            logger.error("Using eos_token, but it is not set yet.")
-        return self._eos_token
-
-    @property
-    def unk_token(self):
-        """ Unknown token (string). Log an error if used while not having been set. """
-        if self._unk_token is None:
-            logger.error("Using unk_token, but it is not set yet.")
-        return self._unk_token
-
-    @property
-    def sep_token(self):
-        """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
-        if self._sep_token is None:
-            logger.error("Using sep_token, but it is not set yet.")
-        return self._sep_token
-
-    @property
-    def pad_token(self):
-        """ Padding token (string). Log an error if used while not having been set. """
-        if self._pad_token is None:
-            logger.error("Using pad_token, but it is not set yet.")
-        return self._pad_token
-
-    @property
-    def cls_token(self):
-        """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
-        if self._cls_token is None:
-            logger.error("Using cls_token, but it is not set yet.")
-        return self._cls_token
-
-    @property
-    def mask_token(self):
-        """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
-        if self._mask_token is None:
-            logger.error("Using mask_token, but it is not set yet.")
-        return self._mask_token
-
-    @property
-    def additional_special_tokens(self):
-        """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
-        if self._additional_special_tokens is None:
-            logger.error("Using additional_special_tokens, but it is not set yet.")
-        return self._additional_special_tokens
-
-    @bos_token.setter
-    def bos_token(self, value):
-        self._bos_token = value
-
-    @eos_token.setter
-    def eos_token(self, value):
-        self._eos_token = value
-
-    @unk_token.setter
-    def unk_token(self, value):
-        self._unk_token = value
-
-    @sep_token.setter
-    def sep_token(self, value):
-        self._sep_token = value
-
-    @pad_token.setter
-    def pad_token(self, value):
-        self._pad_token = value
-
-    @cls_token.setter
-    def cls_token(self, value):
-        self._cls_token = value
-
-    @mask_token.setter
-    def mask_token(self, value):
-        self._mask_token = value
-
-    @additional_special_tokens.setter
-    def additional_special_tokens(self, value):
-        self._additional_special_tokens = value
-
-    @property
-    def bos_token_id(self):
-        """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.bos_token)
-
-    @property
-    def eos_token_id(self):
-        """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.eos_token)
-
-    @property
-    def unk_token_id(self):
-        """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.unk_token)
-
-    @property
-    def sep_token_id(self):
-        """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.sep_token)
-
-    @property
-    def pad_token_id(self):
-        """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.pad_token)
-
-    @property
-    def pad_token_type_id(self):
-        """ Id of the padding token type in the vocabulary."""
-        return self._pad_token_type_id
-
-    @property
-    def cls_token_id(self):
-        """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.cls_token)
-
-    @property
-    def mask_token_id(self):
-        """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.mask_token)
-
-    @property
-    def additional_special_tokens_ids(self):
-        """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
-        return self.convert_tokens_to_ids(self.additional_special_tokens)
-
-    def __init__(self, max_len=None, **kwargs):
-        self._bos_token = None
-        self._eos_token = None
-        self._unk_token = None
-        self._sep_token = None
-        self._pad_token = None
-        self._cls_token = None
-        self._mask_token = None
-        self._pad_token_type_id = 0
-        self._additional_special_tokens = []
-
-        self.max_len = max_len if max_len is not None else int(1e12)
-
-        # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
-        self.padding_side = kwargs.pop("padding_side", self.padding_side)
-
-        # Added tokens
-        self.added_tokens_encoder = {}
-        self.unique_added_tokens_encoder = set()
-        self.added_tokens_decoder = {}
-
-        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
-        self.init_inputs = ()
-        self.init_kwargs = {}
-
-        for key, value in kwargs.items():
-            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
-                if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
-                else:
-                    assert isinstance(value, str)
-                setattr(self, key, value)
-
-    @classmethod
-    def from_pretrained(cls, *inputs, **kwargs):
-        r"""
-        Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
-
-        Args:
-            pretrained_model_name_or_path: either:
-
-                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
-
-            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
-
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the vocabulary files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
-
-        Examples::
-
-            # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
-
-            # Download vocabulary from S3 and cache.
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-            # Download vocabulary from S3 (user-uploaded) and cache.
-            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
-
-            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
-            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
-
-            # If the tokenizer uses a single vocabulary file, you can point directly to this file
-            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
-
-            # You can link tokens to special vocabulary when instantiating
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
-            # You should be sure '<unk>' is in the vocabulary when doing that.
-            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
-            assert tokenizer.unk_token == '<unk>'
-
-        """
-        return cls._from_pretrained(*inputs, **kwargs)
-
-    @classmethod
-    def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
-        cache_dir = kwargs.pop("cache_dir", None)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-
-        s3_models = list(cls.max_model_input_sizes.keys())
-        vocab_files = {}
-        init_configuration = {}
-        if pretrained_model_name_or_path in s3_models:
-            # Get the vocabulary from AWS S3 bucket
-            for file_id, map_list in cls.pretrained_vocab_files_map.items():
-                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
-            if (
-                cls.pretrained_init_configuration
-                and pretrained_model_name_or_path in cls.pretrained_init_configuration
-            ):
-                init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
-        else:
-            # Get the vocabulary from local files
-            logger.info(
-                "Model name '{}' not found in model shortcut name list ({}). "
-                "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
-                    pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
-                )
-            )
-
-            if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
-                if len(cls.vocab_files_names) > 1:
-                    raise ValueError(
-                        "Calling {}.from_pretrained() with the path to a single file or url is not supported."
-                        "Use a model identifier or the path to a directory instead.".format(cls.__name__)
-                    )
-                logger.warning(
-                    "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
-                        cls.__name__
-                    )
-                )
-                file_id = list(cls.vocab_files_names.keys())[0]
-                vocab_files[file_id] = pretrained_model_name_or_path
-            else:
-                # At this point pretrained_model_name_or_path is either a directory or a model identifier name
-                additional_files_names = {
-                    "added_tokens_file": ADDED_TOKENS_FILE,
-                    "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
-                    "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
-                }
-                # Look for the tokenizer main vocabulary files + the additional tokens files
-                for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
-                    if os.path.isdir(pretrained_model_name_or_path):
-                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                        if not os.path.exists(full_file_name):
-                            logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
-                            full_file_name = None
-                    else:
-                        full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
-
-                    vocab_files[file_id] = full_file_name
-
-        # Get files from url, cache, or disk depending on the case
-        try:
-            resolved_vocab_files = {}
-            for file_id, file_path in vocab_files.items():
-                if file_path is None:
-                    resolved_vocab_files[file_id] = None
-                else:
-                    resolved_vocab_files[file_id] = cached_path(
-                        file_path,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        resume_download=resume_download,
-                    )
-        except EnvironmentError:
-            if pretrained_model_name_or_path in s3_models:
-                msg = "Couldn't reach server at '{}' to download vocabulary files."
-            else:
-                msg = (
-                    "Model name '{}' was not found in tokenizers model name list ({}). "
-                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
-                    "named {}, but couldn't find such vocabulary files at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ", ".join(s3_models),
-                        pretrained_model_name_or_path,
-                        list(cls.vocab_files_names.values()),
-                    )
-                )
-
-            raise EnvironmentError(msg)
-
-        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
-            raise EnvironmentError(
-                "Model name '{}' was not found in tokenizers model name list ({}). "
-                "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
-                "named {} but couldn't find such vocabulary files at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(s3_models),
-                    pretrained_model_name_or_path,
-                    list(cls.vocab_files_names.values()),
-                )
-            )
-
-        for file_id, file_path in vocab_files.items():
-            if file_path == resolved_vocab_files[file_id]:
-                logger.info("loading file {}".format(file_path))
-            else:
-                logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
-
-        # Prepare tokenizer initialization kwargs
-        # Did we saved some inputs and kwargs to reload ?
-        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
-        if tokenizer_config_file is not None:
-            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
-                init_kwargs = json.load(tokenizer_config_handle)
-            saved_init_inputs = init_kwargs.pop("init_inputs", ())
-            if not init_inputs:
-                init_inputs = saved_init_inputs
-        else:
-            init_kwargs = init_configuration
-
-        # Update with newly provided kwargs
-        init_kwargs.update(kwargs)
-
-        # Set max length if needed
-        if pretrained_model_name_or_path in cls.max_model_input_sizes:
-            # if we're using a pretrained model, ensure the tokenizer
-            # wont index sequences longer than the number of positional embeddings
-            max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
-            if max_len is not None and isinstance(max_len, (int, float)):
-                init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len)
-
-        # Merge resolved_vocab_files arguments in init_kwargs.
-        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
-        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
-        for args_name, file_path in resolved_vocab_files.items():
-            if args_name not in init_kwargs:
-                init_kwargs[args_name] = file_path
-        if special_tokens_map_file is not None:
-            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
-                special_tokens_map = json.load(special_tokens_map_handle)
-            for key, value in special_tokens_map.items():
-                if key not in init_kwargs:
-                    init_kwargs[key] = value
-
-        # Instantiate tokenizer.
-        try:
-            tokenizer = cls(*init_inputs, **init_kwargs)
-        except OSError:
-            raise OSError(
-                "Unable to load vocabulary from file. "
-                "Please check that the provided vocabulary is accessible and not corrupted."
-            )
-
-        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
-        tokenizer.init_inputs = init_inputs
-        tokenizer.init_kwargs = init_kwargs
-
-        # update unique_added_tokens_encoder with special tokens for correct tokenization
-        tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens))
-
-        # Add supplementary tokens.
-        if added_tokens_file is not None:
-            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
-                added_tok_encoder = json.load(added_tokens_handle)
-            added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
-            tokenizer.added_tokens_encoder.update(added_tok_encoder)
-            tokenizer.added_tokens_decoder.update(added_tok_decoder)
-            tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys()))
-
-        return tokenizer
-
-    def save_pretrained(self, save_directory):
-        """ Save the tokenizer vocabulary files together with:
-                - added tokens,
-                - special-tokens-to-class-attributes-mapping,
-                - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
-
-            This won't save modifications other than (added tokens and special token mapping) you may have
-            applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation).
-
-            This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Saving directory ({}) should be a directory".format(save_directory))
-            return
-
-        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
-        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
-        tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
-
-        tokenizer_config = copy.deepcopy(self.init_kwargs)
-        if len(self.init_inputs) > 0:
-            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
-        for file_id in self.vocab_files_names.keys():
-            tokenizer_config.pop(file_id, None)
-
-        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(tokenizer_config, ensure_ascii=False))
-
-        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
-
-        if len(self.added_tokens_encoder) > 0:
-            with open(added_tokens_file, "w", encoding="utf-8") as f:
-                out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
-                f.write(out_str)
-
-        vocab_files = self.save_vocabulary(save_directory)
-
-        return vocab_files + (special_tokens_map_file, added_tokens_file)
-
-    def save_vocabulary(self, save_directory):
-        """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
-            and special token mappings.
-
-            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
-        """
-        raise NotImplementedError
-
-    def vocab_size(self):
-        """ Size of the base vocabulary (without the added tokens) """
-        raise NotImplementedError
-
-    def __len__(self):
-        """ Size of the full vocabulary with the added tokens """
-        return self.vocab_size + len(self.added_tokens_encoder)
-
-    def add_tokens(self, new_tokens):
-        """
-        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
-        vocabulary, they are added to it with indices starting from length of the current vocabulary.
-
-        Args:
-            new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
-
-        Returns:
-            Number of tokens added to the vocabulary.
-
-        Examples::
-
-            # Let's see how to increase the vocabulary of Bert model and tokenizer
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertModel.from_pretrained('bert-base-uncased')
-
-            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-            print('We have added', num_added_toks, 'tokens')
-            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
-        """
-        if not new_tokens:
-            return 0
-
-        to_add_tokens = []
-        for token in new_tokens:
-            assert isinstance(token, str)
-            if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
-                token = token.lower()
-            if (
-                token != self.unk_token
-                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
-                and token not in to_add_tokens
-            ):
-                to_add_tokens.append(token)
-                logger.info("Adding %s to the vocabulary", token)
-
-        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
-        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
-        self.added_tokens_encoder.update(added_tok_encoder)
-        self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens))
-        self.added_tokens_decoder.update(added_tok_decoder)
-
-        return len(to_add_tokens)
-
-    def num_added_tokens(self, pair=False):
-        """
-        Returns the number of added tokens when encoding a sequence with special tokens.
-
-        Note:
-            This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
-            inside your training loop.
-
-        Args:
-            pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
-                number of added tokens in the case of a single sequence if set to False.
-
-        Returns:
-            Number of tokens added to sequences
-        """
-        token_ids_0 = []
-        token_ids_1 = []
-        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
-
-    def add_special_tokens(self, special_tokens_dict):
-        """
-        Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
-        to class attributes. If special tokens are NOT in the vocabulary, they are added
-        to it (indexed starting from the last index of the current vocabulary).
-
-        Using `add_special_tokens` will ensure your special tokens can be used in several ways:
-
-        - special tokens are carefully handled by the tokenizer (they are never split)
-        - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
-
-        When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')
-
-        Args:
-            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
-                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
-                ``additional_special_tokens``].
-
-                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
-
-        Returns:
-            Number of tokens added to the vocabulary.
-
-        Examples::
-
-            # Let's see how to add a new classification token to GPT-2
-            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-            model = GPT2Model.from_pretrained('gpt2')
-
-            special_tokens_dict = {'cls_token': '<CLS>'}
-
-            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
-            print('We have added', num_added_toks, 'tokens')
-            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
-
-            assert tokenizer.cls_token == '<CLS>'
-        """
-        if not special_tokens_dict:
-            return 0
-
-        added_tokens = 0
-        for key, value in special_tokens_dict.items():
-            assert key in self.SPECIAL_TOKENS_ATTRIBUTES
-            if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
-                added_tokens += self.add_tokens(value)
-            else:
-                assert isinstance(value, str)
-                added_tokens += self.add_tokens([value])
-            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
-            setattr(self, key, value)
-
-        return added_tokens
-
-    def tokenize(self, text, **kwargs):
-        """ Converts a string in a sequence of tokens (string), using the tokenizer.
-            Split in words for word-based vocabulary or sub-words for sub-word-based
-            vocabularies (BPE/SentencePieces/WordPieces).
-
-            Take care of added tokens.
-
-            text: The sequence to be encoded.
-            **kwargs: passed to the child `self.tokenize()` method
-        """
-        all_special_tokens = self.all_special_tokens
-
-        def lowercase_text(t):
-            # convert non-special tokens to lowercase
-            escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
-            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
-            return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t)
-
-        if self.init_kwargs.get("do_lower_case", False):
-            text = lowercase_text(text)
-
-        def split_on_token(tok, text):
-            result = []
-            split_text = text.split(tok)
-            for i, sub_text in enumerate(split_text):
-                sub_text = sub_text.strip()
-                if i == 0 and not sub_text:
-                    result += [tok]
-                elif i == len(split_text) - 1:
-                    if sub_text:
-                        result += [sub_text]
-                    else:
-                        pass
-                else:
-                    if sub_text:
-                        result += [sub_text]
-                    result += [tok]
-            return result
-
-        def split_on_tokens(tok_list, text):
-            if not text.strip():
-                return []
-            if not tok_list:
-                return self._tokenize(text, **kwargs)
-
-            tokenized_text = []
-            text_list = [text]
-            for tok in tok_list:
-                tokenized_text = []
-                for sub_text in text_list:
-                    if sub_text not in self.unique_added_tokens_encoder:
-                        tokenized_text += split_on_token(tok, sub_text)
-                    else:
-                        tokenized_text += [sub_text]
-                text_list = tokenized_text
-
-            return list(
-                itertools.chain.from_iterable(
-                    (
-                        self._tokenize(token, **kwargs) if token not in self.unique_added_tokens_encoder else [token]
-                        for token in tokenized_text
-                    )
-                )
-            )
-
-        added_tokens = self.unique_added_tokens_encoder
-        tokenized_text = split_on_tokens(added_tokens, text)
-        return tokenized_text
-
-    def _tokenize(self, text, **kwargs):
-        """ Converts a string in a sequence of tokens (string), using the tokenizer.
-            Split in words for word-based vocabulary or sub-words for sub-word-based
-            vocabularies (BPE/SentencePieces/WordPieces).
-
-            Do NOT take care of added tokens.
-        """
-        raise NotImplementedError
-
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
-            (resp. a sequence of ids), using the vocabulary.
-        """
-        if tokens is None:
-            return None
-
-        if isinstance(tokens, str):
-            return self._convert_token_to_id_with_added_voc(tokens)
-
-        ids = []
-        for token in tokens:
-            ids.append(self._convert_token_to_id_with_added_voc(token))
-        return ids
-
-    def _convert_token_to_id_with_added_voc(self, token):
-        if token is None:
-            return None
-
-        if token in self.added_tokens_encoder:
-            return self.added_tokens_encoder[token]
-        return self._convert_token_to_id(token)
-
-    def _convert_token_to_id(self, token):
-        raise NotImplementedError
-
-    def encode(
-        self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        **kwargs
-    ):
-        """
-        Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-
-        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
-
-        Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
-                string using the `tokenize` method) or a list of integers (tokenized string ids using the
-                `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
-                to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
-                If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
-                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
-                - 'only_first': Only truncate the first sequence
-                - 'only_second': Only truncate the second sequence
-                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-                Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            **kwargs: passed to the `self.tokenize()` method
-        """
-        encoded_inputs = self.encode_plus(
-            text,
-            text_pair=text_pair,
-            max_length=max_length,
-            add_special_tokens=add_special_tokens,
-            stride=stride,
-            truncation_strategy=truncation_strategy,
-            pad_to_max_length=pad_to_max_length,
-            return_tensors=return_tensors,
-            **kwargs,
-        )
-
-        return encoded_inputs["input_ids"]
-
-    def encode_plus(
-        self,
-        text,
-        text_pair=None,
-        add_special_tokens=True,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        **kwargs
-    ):
-        """
-        Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
-        the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
-
-        Args:
-            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method)
-            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
-                string using the `tokenize` method) or a list of integers (tokenized string ids using the
-                `convert_tokens_to_ids` method)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
-                to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
-                If there are overflowing tokens, those will be added to the returned dictionary
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
-                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
-                - 'only_first': Only truncate the first sequence
-                - 'only_second': Only truncate the second sequence
-                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-                Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
-            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
-            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
-            **kwargs: passed to the `self.tokenize()` method
-
-        Return:
-            A Dictionary of shape::
-
-                {
-                    input_ids: list[int],
-                    token_type_ids: list[int] if return_token_type_ids is True (default)
-                    attention_mask: list[int] if return_attention_mask is True (default)
-                    overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
-                    num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
-                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
-                }
-
-            With the fields:
-                ``input_ids``: list of token ids to be fed to a model
-                ``token_type_ids``: list of token type ids to be fed to a model
-                ``attention_mask``: list of indices specifying which tokens should be attended to by the model
-                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-                ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-                tokens and 1 specifying sequence tokens.
-        """
-
-        def get_input_ids(text):
-            if isinstance(text, str):
-                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                return self.convert_tokens_to_ids(text)
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                return text
-            else:
-                raise ValueError(
-                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
-                )
-
-        first_ids = get_input_ids(text)
-        second_ids = get_input_ids(text_pair) if text_pair is not None else None
-
-        return self.prepare_for_model(
-            first_ids,
-            pair_ids=second_ids,
-            max_length=max_length,
-            pad_to_max_length=pad_to_max_length,
-            add_special_tokens=add_special_tokens,
-            stride=stride,
-            truncation_strategy=truncation_strategy,
-            return_tensors=return_tensors,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-        )
-
-    def batch_encode_plus(
-        self,
-        batch_text_or_text_pairs=None,
-        add_special_tokens=False,
-        max_length=None,
-        stride=0,
-        truncation_strategy="longest_first",
-        return_tensors=None,
-        return_input_lengths=False,
-        return_attention_masks=False,
-        **kwargs
-    ):
-        """
-        Returns a dictionary containing the encoded sequence or sequence pair and additional information:
-        the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
-
-        Args:
-            batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
-                This can be a list of string/string-sequences/int-sequences or a list of pair of
-                string/string-sequences/int-sequence (see details in encode_plus)
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
-                to their model.
-            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
-                If there are overflowing tokens, those will be added to the returned dictionary`
-            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
-                from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncation_strategy: string selected in the following options:
-                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
-                - 'only_first': Only truncate the first sequence
-                - 'only_second': Only truncate the second sequence
-                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            **kwargs: passed to the `self.tokenize()` method
-        """
-        batch_outputs = {}
-        for ids_or_pair_ids in batch_text_or_text_pairs:
-            if isinstance(ids_or_pair_ids, (list, tuple)):
-                assert len(ids_or_pair_ids) == 2
-                ids, pair_ids = ids_or_pair_ids
-            else:
-                ids, pair_ids = ids_or_pair_ids, None
-            outputs = self.encode_plus(
-                ids,
-                pair_ids,
-                add_special_tokens=add_special_tokens,
-                max_length=max_length,
-                stride=stride,
-                truncation_strategy=truncation_strategy,
-                return_tensors=None,
-            )
-
-            # Append the non-padded length to the output
-            if return_input_lengths:
-                outputs["input_len"] = len(outputs["input_ids"])
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        # Compute longest sequence size
-        max_seq_len = max(map(len, batch_outputs["input_ids"]))
-
-        if return_attention_masks:
-            # Allow the model to not give any special attention to padded input
-            batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]]
-
-        if return_tensors is not None:
-
-            # Do the tensor conversion in batch
-            for key, value in batch_outputs.items():
-
-                padded_value = value
-                # verify that the tokenizer has a pad_token_id
-                if key != "input_len" and self._pad_token is not None:
-                    # Padding handle
-                    padded_value = [
-                        v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v))
-                        for v in padded_value
-                    ]
-
-                if return_tensors == "tf" and is_tf_available():
-                    batch_outputs[key] = tf.constant(padded_value)
-                elif return_tensors == "pt" and is_torch_available():
-                    batch_outputs[key] = torch.tensor(padded_value)
-                elif return_tensors is not None:
-                    logger.warning(
-                        "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                            return_tensors
-                        )
-                    )
-
-        # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
-        if return_attention_masks:
-            if is_tf_available():
-                batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1)
-            else:
-                batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1)
-
-        return batch_outputs
-
-    def prepare_for_model(
-        self,
-        ids,
-        pair_ids=None,
-        max_length=None,
-        add_special_tokens=True,
-        stride=0,
-        truncation_strategy="longest_first",
-        pad_to_max_length=False,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-    ):
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
-        It adds special tokens, truncates
-        sequences if overflowing while taking into account the special tokens and manages a window stride for
-        overflowing tokens
-
-        Args:
-            ids: list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
-            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
-                to their model.
-            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
-                list of inputs.
-            truncation_strategy: string selected in the following options:
-                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences)
-                - 'only_first': Only truncate the first sequence
-                - 'only_second': Only truncate the second sequence
-                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
-                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
-                The tokenizer padding sides are handled by the following strings:
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-                Defaults to False: no padding.
-            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
-                or PyTorch torch.Tensor instead of a list of python integers.
-            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
-            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
-            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
-
-        Return:
-            A Dictionary of shape::
-
-                {
-                    input_ids: list[int],
-                    token_type_ids: list[int] if return_token_type_ids is True (default)
-                    overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
-                    num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
-                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
-                }
-
-            With the fields:
-                ``input_ids``: list of token ids to be fed to a model
-                ``token_type_ids``: list of token type ids to be fed to a model
-
-                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
-                ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
-                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
-                tokens and 1 specifying sequence tokens.
-        """
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-
-        encoded_inputs = {}
-
-        # Handle max sequence length
-        total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
-        if max_length and total_len > max_length:
-            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
-                ids,
-                pair_ids=pair_ids,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-            if return_overflowing_tokens:
-                encoded_inputs["overflowing_tokens"] = overflowing_tokens
-                encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Handle special_tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-
-        if return_special_tokens_mask:
-            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-
-        encoded_inputs["input_ids"] = sequence
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-
-        if max_length and len(encoded_inputs["input_ids"]) > max_length:
-            encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
-            if return_token_type_ids:
-                encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
-            if return_special_tokens_mask:
-                encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
-
-        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum sequence length "
-                "for this model ({} > {}). Running this sequence through the model will result in "
-                "indexing errors".format(len(ids), self.max_len)
-            )
-
-        needs_to_be_padded = pad_to_max_length and (
-            max_length
-            and len(encoded_inputs["input_ids"]) < max_length
-            or max_length is None
-            and len(encoded_inputs["input_ids"]) < self.max_len
-            and self.max_len <= 10000
-        )
-
-        if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning(
-                "Sequence can't be padded as no maximum length is specified and the model maximum length is too high."
-            )
-
-        if needs_to_be_padded:
-            difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
-
-            if self.padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
-                if return_token_type_ids:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if return_special_tokens_mask:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
-                if return_token_type_ids:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if return_special_tokens_mask:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
-
-            else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-
-        elif return_attention_mask:
-            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
-
-        # Prepare inputs as tensors if asked
-        if return_tensors == "tf" and is_tf_available():
-            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
-
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
-
-        elif return_tensors == "pt" and is_torch_available():
-            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
-
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
-        elif return_tensors is not None:
-            logger.warning(
-                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                    return_tensors
-                )
-            )
-
-        return encoded_inputs
-
-    def truncate_sequences(
-        self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0
-    ):
-        """Truncates a sequence pair in place to the maximum length.
-            truncation_strategy: string selected in the following options:
-                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-                    starting from the longest one at each token (when there is a pair of input sequences).
-                    Overflowing tokens only contains overflow from the first sequence.
-                - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
-                - 'only_second': Only truncate the second sequence
-                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, pair_ids, []
-
-        if truncation_strategy == "longest_first":
-            overflowing_tokens = []
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    overflowing_tokens = [ids[-1]] + overflowing_tokens
-                    ids = ids[:-1]
-                else:
-                    pair_ids = pair_ids[:-1]
-            window_len = min(len(ids), stride)
-            if window_len > 0:
-                overflowing_tokens = ids[-window_len:] + overflowing_tokens
-        elif truncation_strategy == "only_first":
-            assert len(ids) > num_tokens_to_remove
-            window_len = min(len(ids), stride + num_tokens_to_remove)
-            overflowing_tokens = ids[-window_len:]
-            ids = ids[:-num_tokens_to_remove]
-        elif truncation_strategy == "only_second":
-            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
-            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-            overflowing_tokens = pair_ids[-window_len:]
-            pair_ids = pair_ids[:-num_tokens_to_remove]
-        elif truncation_strategy == "do_not_truncate":
-            raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
-        else:
-            raise ValueError(
-                "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']"
-            )
-        return (ids, pair_ids, overflowing_tokens)
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        if token_ids_1 is None:
-            return len(token_ids_0) * [0]
-        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
-        """
-        if token_ids_1 is None:
-            return token_ids_0
-        return token_ids_0 + token_ids_1
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
-
-            Args:
-                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
-        """
-        if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return self.added_tokens_decoder[ids]
-            else:
-                return self._convert_id_to_token(ids)
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
-        return tokens
-
-    def _convert_id_to_token(self, index):
-        raise NotImplementedError
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string.
-            The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
-            but we often want to remove sub-word tokenization artifacts at the same time.
-        """
-        return " ".join(self.convert_ids_to_tokens(tokens))
-
-    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """
-        Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
-        with options to remove special tokens and clean up tokenization spaces.
-        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
-
-        Args:
-            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
-            skip_special_tokens: if set to True, will replace special tokens.
-            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
-        """
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
-        # To avoid mixing byte-level and unicode for byte-level BPT
-        # we need to build string separatly for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/transformers/issues/1133
-        sub_texts = []
-        current_sub_text = []
-        for token in filtered_tokens:
-            if skip_special_tokens and token in self.all_special_ids:
-                continue
-            if token in self.added_tokens_encoder:
-                if current_sub_text:
-                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-                    current_sub_text = []
-                sub_texts.append(token)
-            else:
-                current_sub_text.append(token)
-        if current_sub_text:
-            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = " ".join(sub_texts)
-
-        if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
-        else:
-            return text
-
-    @property
-    def special_tokens_map(self):
-        """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
-            values ('<unk>', '<cls>'...)
-        """
-        set_attr = {}
-        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-            attr_value = getattr(self, "_" + attr)
-            if attr_value:
-                set_attr[attr] = attr_value
-        return set_attr
-
-    @property
-    def all_special_tokens(self):
-        """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
-            (cls_token, unk_token...).
-        """
-        all_toks = []
-        set_attr = self.special_tokens_map
-        for attr_value in set_attr.values():
-            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
-        all_toks = list(set(all_toks))
-        return all_toks
-
-    @property
-    def all_special_ids(self):
-        """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
-            class attributes (cls_token, unk_token...).
-        """
-        all_toks = self.all_special_tokens
-        all_ids = self.convert_tokens_to_ids(all_toks)
-        return all_ids
-
-    @staticmethod
-    def clean_up_tokenization(out_string):
-        """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
-        """
-        out_string = (
-            out_string.replace(" .", ".")
-            .replace(" ?", "?")
-            .replace(" !", "!")
-            .replace(" ,", ",")
-            .replace(" ' ", "'")
-            .replace(" n't", "n't")
-            .replace(" 'm", "'m")
-            .replace(" do not", " don't")
-            .replace(" 's", "'s")
-            .replace(" 've", "'ve")
-            .replace(" 're", "'re")
-        )
-        return out_string
-
-
-class PreTrainedTokenizerFast(PreTrainedTokenizer):
-    _tokenizer = None
-    _decoder = None
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @property
-    def tokenizer(self):
-        if self._tokenizer is None:
-            raise NotImplementedError
-        return self._tokenizer
-
-    @property
-    def decoder(self):
-        if self._decoder is None:
-            raise NotImplementedError
-        return self._decoder
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.get_vocab_size(with_added_tokens=False)
-
-    def __len__(self):
-        return self.tokenizer.get_vocab_size(with_added_tokens=True)
-
-    @PreTrainedTokenizer.bos_token.setter
-    def bos_token(self, value):
-        self._bos_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.eos_token.setter
-    def eos_token(self, value):
-        self._eos_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.unk_token.setter
-    def unk_token(self, value):
-        self._unk_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.sep_token.setter
-    def sep_token(self, value):
-        self._sep_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.pad_token.setter
-    def pad_token(self, value):
-        self._pad_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.cls_token.setter
-    def cls_token(self, value):
-        self._cls_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.mask_token.setter
-    def mask_token(self, value):
-        self._mask_token = value
-        self._update_special_tokens()
-
-    @PreTrainedTokenizer.additional_special_tokens.setter
-    def additional_special_tokens(self, value):
-        self._additional_special_tokens = value
-        self._update_special_tokens()
-
-    def _update_special_tokens(self):
-        if self._tokenizer is not None:
-            self._tokenizer.add_special_tokens(self.all_special_tokens)
-
-    @staticmethod
-    def _convert_encoding(
-        encoding,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-    ):
-        encoding_dict = {
-            "input_ids": encoding.ids,
-        }
-        if return_token_type_ids:
-            encoding_dict["token_type_ids"] = encoding.type_ids
-        if return_attention_mask:
-            encoding_dict["attention_mask"] = encoding.attention_mask
-        if return_overflowing_tokens:
-            overflowing = encoding.overflowing
-            encoding_dict["overflowing_tokens"] = overflowing.ids if overflowing is not None else []
-        if return_special_tokens_mask:
-            encoding_dict["special_tokens_mask"] = encoding.special_tokens_mask
-
-        # Prepare inputs as tensors if asked
-        if return_tensors == "tf" and is_tf_available():
-            encoding_dict["input_ids"] = tf.constant([encoding_dict["input_ids"]])
-            if "token_type_ids" in encoding_dict:
-                encoding_dict["token_type_ids"] = tf.constant([encoding_dict["token_type_ids"]])
-
-            if "attention_mask" in encoding_dict:
-                encoding_dict["attention_mask"] = tf.constant([encoding_dict["attention_mask"]])
-
-        elif return_tensors == "pt" and is_torch_available():
-            encoding_dict["input_ids"] = torch.tensor([encoding_dict["input_ids"]])
-            if "token_type_ids" in encoding_dict:
-                encoding_dict["token_type_ids"] = torch.tensor([encoding_dict["token_type_ids"]])
-
-            if "attention_mask" in encoding_dict:
-                encoding_dict["attention_mask"] = torch.tensor([encoding_dict["attention_mask"]])
-        elif return_tensors is not None:
-            logger.warning(
-                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
-                    return_tensors
-                )
-            )
-
-        return encoding_dict
-
-    def encode_plus(
-        self,
-        text,
-        text_pair=None,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-        **kwargs
-    ):
-        encoding = self.tokenizer.encode(text, text_pair)
-        return self._convert_encoding(
-            encoding,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-        )
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text).tokens
-
-    def _convert_token_to_id_with_added_voc(self, token):
-        id = self.tokenizer.token_to_id(token)
-        if id is None:
-            return self.unk_token_id
-        return id
-
-    def _convert_id_to_token(self, index):
-        return self.tokenizer.id_to_token(int(index))
-
-    def convert_tokens_to_string(self, tokens):
-        return self.decoder.decode(tokens)
-
-    def add_tokens(self, new_tokens):
-        self.tokenizer.add_tokens(new_tokens)
-
-    def add_special_tokens(self, special_tokens_dict):
-        added = super().add_special_tokens(special_tokens_dict)
-        self._update_special_tokens()
-        return added
-
-    def encode_batch(
-        self,
-        texts,
-        return_tensors=None,
-        return_token_type_ids=True,
-        return_attention_mask=True,
-        return_overflowing_tokens=False,
-        return_special_tokens_mask=False,
-    ):
-        return [
-            self._convert_encoding(
-                encoding,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-            )
-            for encoding in self.tokenizer.encode_batch(texts)
-        ]
-
-    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        text = self.tokenizer.decode(token_ids, skip_special_tokens)
-
-        if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
-        else:
-            return text
-
-    def decode_batch(self, ids_batch, skip_special_tokens=False, clear_up_tokenization_spaces=True):
-        return [
-            self.clean_up_tokenization(text) if clear_up_tokenization_spaces else text
-            for text in self.tokenizer.decode_batch(ids_batch, skip_special_tokens)
-        ]
diff --git a/server/transformers/src/transformers/tokenization_xlm.py b/server/transformers/src/transformers/tokenization_xlm.py
deleted file mode 100644
index 518f3dd7ffbff955830e07be02a561d53e3a060e..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_xlm.py
+++ /dev/null
@@ -1,892 +0,0 @@
-# coding=utf-8
-# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for XLM."""
-
-
-import json
-import logging
-import os
-import re
-import sys
-import unicodedata
-
-import sacremoses as sm
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
-        "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
-        "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
-        "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
-        "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
-        "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
-        "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
-        "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
-        "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
-        "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
-    },
-    "merges_file": {
-        "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
-        "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
-        "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
-        "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
-        "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
-        "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
-        "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
-        "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
-        "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
-        "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
-    },
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlm-mlm-en-2048": 512,
-    "xlm-mlm-ende-1024": 512,
-    "xlm-mlm-enfr-1024": 512,
-    "xlm-mlm-enro-1024": 512,
-    "xlm-mlm-tlm-xnli15-1024": 512,
-    "xlm-mlm-xnli15-1024": 512,
-    "xlm-clm-enfr-1024": 512,
-    "xlm-clm-ende-1024": 512,
-    "xlm-mlm-17-1280": 512,
-    "xlm-mlm-100-1280": 512,
-}
-
-PRETRAINED_INIT_CONFIGURATION = {
-    "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
-    "xlm-mlm-ende-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {"0": "de", "1": "en"},
-        "lang2id": {"de": 0, "en": 1},
-    },
-    "xlm-mlm-enfr-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {"0": "en", "1": "fr"},
-        "lang2id": {"en": 0, "fr": 1},
-    },
-    "xlm-mlm-enro-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {"0": "en", "1": "ro"},
-        "lang2id": {"en": 0, "ro": 1},
-    },
-    "xlm-mlm-tlm-xnli15-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {
-            "0": "ar",
-            "1": "bg",
-            "2": "de",
-            "3": "el",
-            "4": "en",
-            "5": "es",
-            "6": "fr",
-            "7": "hi",
-            "8": "ru",
-            "9": "sw",
-            "10": "th",
-            "11": "tr",
-            "12": "ur",
-            "13": "vi",
-            "14": "zh",
-        },
-        "lang2id": {
-            "ar": 0,
-            "bg": 1,
-            "de": 2,
-            "el": 3,
-            "en": 4,
-            "es": 5,
-            "fr": 6,
-            "hi": 7,
-            "ru": 8,
-            "sw": 9,
-            "th": 10,
-            "tr": 11,
-            "ur": 12,
-            "vi": 13,
-            "zh": 14,
-        },
-    },
-    "xlm-mlm-xnli15-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {
-            "0": "ar",
-            "1": "bg",
-            "2": "de",
-            "3": "el",
-            "4": "en",
-            "5": "es",
-            "6": "fr",
-            "7": "hi",
-            "8": "ru",
-            "9": "sw",
-            "10": "th",
-            "11": "tr",
-            "12": "ur",
-            "13": "vi",
-            "14": "zh",
-        },
-        "lang2id": {
-            "ar": 0,
-            "bg": 1,
-            "de": 2,
-            "el": 3,
-            "en": 4,
-            "es": 5,
-            "fr": 6,
-            "hi": 7,
-            "ru": 8,
-            "sw": 9,
-            "th": 10,
-            "tr": 11,
-            "ur": 12,
-            "vi": 13,
-            "zh": 14,
-        },
-    },
-    "xlm-clm-enfr-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {"0": "en", "1": "fr"},
-        "lang2id": {"en": 0, "fr": 1},
-    },
-    "xlm-clm-ende-1024": {
-        "do_lowercase_and_remove_accent": True,
-        "id2lang": {"0": "de", "1": "en"},
-        "lang2id": {"de": 0, "en": 1},
-    },
-    "xlm-mlm-17-1280": {
-        "do_lowercase_and_remove_accent": False,
-        "id2lang": {
-            "0": "ar",
-            "1": "de",
-            "2": "en",
-            "3": "es",
-            "4": "fr",
-            "5": "hi",
-            "6": "it",
-            "7": "ja",
-            "8": "ko",
-            "9": "nl",
-            "10": "pl",
-            "11": "pt",
-            "12": "ru",
-            "13": "sv",
-            "14": "tr",
-            "15": "vi",
-            "16": "zh",
-        },
-        "lang2id": {
-            "ar": 0,
-            "de": 1,
-            "en": 2,
-            "es": 3,
-            "fr": 4,
-            "hi": 5,
-            "it": 6,
-            "ja": 7,
-            "ko": 8,
-            "nl": 9,
-            "pl": 10,
-            "pt": 11,
-            "ru": 12,
-            "sv": 13,
-            "tr": 14,
-            "vi": 15,
-            "zh": 16,
-        },
-    },
-    "xlm-mlm-100-1280": {
-        "do_lowercase_and_remove_accent": False,
-        "id2lang": {
-            "0": "af",
-            "1": "als",
-            "2": "am",
-            "3": "an",
-            "4": "ang",
-            "5": "ar",
-            "6": "arz",
-            "7": "ast",
-            "8": "az",
-            "9": "bar",
-            "10": "be",
-            "11": "bg",
-            "12": "bn",
-            "13": "br",
-            "14": "bs",
-            "15": "ca",
-            "16": "ceb",
-            "17": "ckb",
-            "18": "cs",
-            "19": "cy",
-            "20": "da",
-            "21": "de",
-            "22": "el",
-            "23": "en",
-            "24": "eo",
-            "25": "es",
-            "26": "et",
-            "27": "eu",
-            "28": "fa",
-            "29": "fi",
-            "30": "fr",
-            "31": "fy",
-            "32": "ga",
-            "33": "gan",
-            "34": "gl",
-            "35": "gu",
-            "36": "he",
-            "37": "hi",
-            "38": "hr",
-            "39": "hu",
-            "40": "hy",
-            "41": "ia",
-            "42": "id",
-            "43": "is",
-            "44": "it",
-            "45": "ja",
-            "46": "jv",
-            "47": "ka",
-            "48": "kk",
-            "49": "kn",
-            "50": "ko",
-            "51": "ku",
-            "52": "la",
-            "53": "lb",
-            "54": "lt",
-            "55": "lv",
-            "56": "mk",
-            "57": "ml",
-            "58": "mn",
-            "59": "mr",
-            "60": "ms",
-            "61": "my",
-            "62": "nds",
-            "63": "ne",
-            "64": "nl",
-            "65": "nn",
-            "66": "no",
-            "67": "oc",
-            "68": "pl",
-            "69": "pt",
-            "70": "ro",
-            "71": "ru",
-            "72": "scn",
-            "73": "sco",
-            "74": "sh",
-            "75": "si",
-            "76": "simple",
-            "77": "sk",
-            "78": "sl",
-            "79": "sq",
-            "80": "sr",
-            "81": "sv",
-            "82": "sw",
-            "83": "ta",
-            "84": "te",
-            "85": "th",
-            "86": "tl",
-            "87": "tr",
-            "88": "tt",
-            "89": "uk",
-            "90": "ur",
-            "91": "uz",
-            "92": "vi",
-            "93": "war",
-            "94": "wuu",
-            "95": "yi",
-            "96": "zh",
-            "97": "zh_classical",
-            "98": "zh_min_nan",
-            "99": "zh_yue",
-        },
-        "lang2id": {
-            "af": 0,
-            "als": 1,
-            "am": 2,
-            "an": 3,
-            "ang": 4,
-            "ar": 5,
-            "arz": 6,
-            "ast": 7,
-            "az": 8,
-            "bar": 9,
-            "be": 10,
-            "bg": 11,
-            "bn": 12,
-            "br": 13,
-            "bs": 14,
-            "ca": 15,
-            "ceb": 16,
-            "ckb": 17,
-            "cs": 18,
-            "cy": 19,
-            "da": 20,
-            "de": 21,
-            "el": 22,
-            "en": 23,
-            "eo": 24,
-            "es": 25,
-            "et": 26,
-            "eu": 27,
-            "fa": 28,
-            "fi": 29,
-            "fr": 30,
-            "fy": 31,
-            "ga": 32,
-            "gan": 33,
-            "gl": 34,
-            "gu": 35,
-            "he": 36,
-            "hi": 37,
-            "hr": 38,
-            "hu": 39,
-            "hy": 40,
-            "ia": 41,
-            "id": 42,
-            "is": 43,
-            "it": 44,
-            "ja": 45,
-            "jv": 46,
-            "ka": 47,
-            "kk": 48,
-            "kn": 49,
-            "ko": 50,
-            "ku": 51,
-            "la": 52,
-            "lb": 53,
-            "lt": 54,
-            "lv": 55,
-            "mk": 56,
-            "ml": 57,
-            "mn": 58,
-            "mr": 59,
-            "ms": 60,
-            "my": 61,
-            "nds": 62,
-            "ne": 63,
-            "nl": 64,
-            "nn": 65,
-            "no": 66,
-            "oc": 67,
-            "pl": 68,
-            "pt": 69,
-            "ro": 70,
-            "ru": 71,
-            "scn": 72,
-            "sco": 73,
-            "sh": 74,
-            "si": 75,
-            "simple": 76,
-            "sk": 77,
-            "sl": 78,
-            "sq": 79,
-            "sr": 80,
-            "sv": 81,
-            "sw": 82,
-            "ta": 83,
-            "te": 84,
-            "th": 85,
-            "tl": 86,
-            "tr": 87,
-            "tt": 88,
-            "uk": 89,
-            "ur": 90,
-            "uz": 91,
-            "vi": 92,
-            "war": 93,
-            "wuu": 94,
-            "yi": 95,
-            "zh": 96,
-            "zh_classical": 97,
-            "zh_min_nan": 98,
-            "zh_yue": 99,
-        },
-    },
-}
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-    word is represented as tuple of symbols (symbols being variable-length strings)
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-def lowercase_and_remove_accent(text):
-    """
-    Lowercase and strips accents from a piece of text based on
-    https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
-    """
-    text = " ".join(text)
-    text = text.lower()
-    text = unicodedata.normalize("NFD", text)
-    output = []
-    for char in text:
-        cat = unicodedata.category(char)
-        if cat == "Mn":
-            continue
-        output.append(char)
-    return "".join(output).lower().split(" ")
-
-
-def replace_unicode_punct(text):
-    """
-    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
-    """
-    text = text.replace("，", ",")
-    text = re.sub(r"。\s*", ". ", text)
-    text = text.replace("、", ",")
-    text = text.replace("”", '"')
-    text = text.replace("“", '"')
-    text = text.replace("∶", ":")
-    text = text.replace("：", ":")
-    text = text.replace("？", "?")
-    text = text.replace("《", '"')
-    text = text.replace("》", '"')
-    text = text.replace("）", ")")
-    text = text.replace("！", "!")
-    text = text.replace("（", "(")
-    text = text.replace("；", ";")
-    text = text.replace("１", "1")
-    text = text.replace("」", '"')
-    text = text.replace("「", '"')
-    text = text.replace("０", "0")
-    text = text.replace("３", "3")
-    text = text.replace("２", "2")
-    text = text.replace("５", "5")
-    text = text.replace("６", "6")
-    text = text.replace("９", "9")
-    text = text.replace("７", "7")
-    text = text.replace("８", "8")
-    text = text.replace("４", "4")
-    text = re.sub(r"．\s*", ". ", text)
-    text = text.replace("～", "~")
-    text = text.replace("’", "'")
-    text = text.replace("…", "...")
-    text = text.replace("━", "-")
-    text = text.replace("〈", "<")
-    text = text.replace("〉", ">")
-    text = text.replace("【", "[")
-    text = text.replace("】", "]")
-    text = text.replace("％", "%")
-    return text
-
-
-def remove_non_printing_char(text):
-    """
-    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
-    """
-    output = []
-    for char in text:
-        cat = unicodedata.category(char)
-        if cat.startswith("C"):
-            continue
-        output.append(char)
-    return "".join(output)
-
-
-def romanian_preprocessing(text):
-    """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`"""
-    # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
-    text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
-    text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
-    # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
-    text = text.replace("\u0218", "S").replace("\u0219", "s")  # s-comma
-    text = text.replace("\u021a", "T").replace("\u021b", "t")  # t-comma
-    text = text.replace("\u0102", "A").replace("\u0103", "a")
-    text = text.replace("\u00C2", "A").replace("\u00E2", "a")
-    text = text.replace("\u00CE", "I").replace("\u00EE", "i")
-    return text
-
-
-class XLMTokenizer(PreTrainedTokenizer):
-    """
-    BPE tokenizer for XLM
-
-        - Moses preprocessing & tokenization for most supported languages
-
-        - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
-
-        - (optionally) lower case & normalize all inputs text
-
-        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
-        (ex: "__classify__") to a vocabulary
-
-        - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
-
-        - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
-
-        - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies)
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        unk_token="<unk>",
-        bos_token="<s>",
-        sep_token="</s>",
-        pad_token="<pad>",
-        cls_token="</s>",
-        mask_token="<special1>",
-        additional_special_tokens=[
-            "<special0>",
-            "<special1>",
-            "<special2>",
-            "<special3>",
-            "<special4>",
-            "<special5>",
-            "<special6>",
-            "<special7>",
-            "<special8>",
-            "<special9>",
-        ],
-        lang2id=None,
-        id2lang=None,
-        do_lowercase_and_remove_accent=True,
-        **kwargs
-    ):
-        super().__init__(
-            unk_token=unk_token,
-            bos_token=bos_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-
-        # cache of sm.MosesPunctNormalizer instance
-        self.cache_moses_punct_normalizer = dict()
-        # cache of sm.MosesTokenizer instance
-        self.cache_moses_tokenizer = dict()
-        self.lang_with_custom_tokenizer = set(["zh", "th", "ja"])
-        # True for current supported model (v1.2.0), False for XLM-17 & 100
-        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
-        self.lang2id = lang2id
-        self.id2lang = id2lang
-        if lang2id is not None and id2lang is not None:
-            assert len(lang2id) == len(id2lang)
-
-        self.ja_word_tokenizer = None
-        self.zh_word_tokenizer = None
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[:-1]
-        merges = [tuple(merge.split()[:2]) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-
-    def moses_punct_norm(self, text, lang):
-        if lang not in self.cache_moses_punct_normalizer:
-            punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
-            self.cache_moses_punct_normalizer[lang] = punct_normalizer
-        else:
-            punct_normalizer = self.cache_moses_punct_normalizer[lang]
-        return punct_normalizer.normalize(text)
-
-    def moses_tokenize(self, text, lang):
-        if lang not in self.cache_moses_tokenizer:
-            moses_tokenizer = sm.MosesTokenizer(lang=lang)
-            self.cache_moses_tokenizer[lang] = moses_tokenizer
-        else:
-            moses_tokenizer = self.cache_moses_tokenizer[lang]
-        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
-
-    def moses_pipeline(self, text, lang):
-        text = replace_unicode_punct(text)
-        text = self.moses_punct_norm(text, lang)
-        text = remove_non_printing_char(text)
-        return text
-
-    def ja_tokenize(self, text):
-        if self.ja_word_tokenizer is None:
-            try:
-                import Mykytea
-
-                self.ja_word_tokenizer = Mykytea.Mykytea(
-                    "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~")
-                )
-            except (AttributeError, ImportError):
-                logger.error(
-                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps"
-                )
-                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
-                logger.error("2. autoreconf -i")
-                logger.error("3. ./configure --prefix=$HOME/local")
-                logger.error("4. make && make install")
-                logger.error("5. pip install kytea")
-                raise
-        return list(self.ja_word_tokenizer.getWS(text))
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + "</w>",)
-        if token in self.cache:
-            return self.cache[token]
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token + "</w>"
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        if word == "\n  </w>":
-            word = "\n</w>"
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
-        """
-        Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses.
-
-        Details of tokenization:
-        - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
-            - Install with `pip install sacremoses`
-        - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
-            - Install with `pip install pythainlp`
-        - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
-            - Install with the following steps:
-            ```
-            git clone git@github.com:neubig/kytea.git && cd kytea
-            autoreconf -i
-            ./configure --prefix=$HOME/local
-            make && make install
-            pip install kytea
-            ```
-        - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
-            - Install with `pip install jieba`
-
-        (*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
-        However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
-        Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
-        if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM
-        [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
-        and set `bypass_tokenizer=True` to bypass the tokenizer.
-
-        Args:
-            - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it.
-            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)  (bool). If True, we only apply BPE.
-
-        Returns:
-            List of tokens.
-        """
-        if lang and self.lang2id and lang not in self.lang2id:
-            logger.error(
-                "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
-            )
-        if bypass_tokenizer:
-            text = text.split()
-        elif lang not in self.lang_with_custom_tokenizer:
-            text = self.moses_pipeline(text, lang=lang)
-            # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
-            if lang == "ro":
-                text = romanian_preprocessing(text)
-            text = self.moses_tokenize(text, lang=lang)
-        elif lang == "th":
-            text = self.moses_pipeline(text, lang=lang)
-            try:
-                if "pythainlp" not in sys.modules:
-                    from pythainlp.tokenize import word_tokenize as th_word_tokenize
-                else:
-                    th_word_tokenize = sys.modules["pythainlp"].word_tokenize
-            except (AttributeError, ImportError):
-                logger.error(
-                    "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps"
-                )
-                logger.error("1. pip install pythainlp")
-                raise
-            text = th_word_tokenize(text)
-        elif lang == "zh":
-            try:
-                if "jieba" not in sys.modules:
-                    import jieba
-                else:
-                    jieba = sys.modules["jieba"]
-            except (AttributeError, ImportError):
-                logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
-                logger.error("1. pip install jieba")
-                raise
-            text = " ".join(jieba.cut(text))
-            text = self.moses_pipeline(text, lang=lang)
-            text = text.split()
-        elif lang == "ja":
-            text = self.moses_pipeline(text, lang=lang)
-            text = self.ja_tokenize(text)
-        else:
-            raise ValueError("It should not reach here")
-
-        if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
-            text = lowercase_and_remove_accent(text)
-
-        split_tokens = []
-        for token in text:
-            if token:
-                split_tokens.extend([t for t in self.bpe(token).split(" ")])
-
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = "".join(tokens).replace("</w>", " ").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A XLM sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s> B </s>
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0,))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        An XLM sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, save_directory):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
diff --git a/server/transformers/src/transformers/tokenization_xlm_roberta.py b/server/transformers/src/transformers/tokenization_xlm_roberta.py
deleted file mode 100644
index ea39d945ae78fd703f05392ecdf8910805e10324..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_xlm_roberta.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-""" Tokenization classes for XLM-RoBERTa model."""
-
-
-import logging
-import os
-from shutil import copyfile
-
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from .tokenization_xlnet import SPIECE_UNDERLINE
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
-        "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlm-roberta-base": 512,
-    "xlm-roberta-large": 512,
-    "xlm-roberta-large-finetuned-conll02-dutch": 512,
-    "xlm-roberta-large-finetuned-conll02-spanish": 512,
-    "xlm-roberta-large-finetuned-conll03-english": 512,
-    "xlm-roberta-large-finetuned-conll03-german": 512,
-}
-
-
-class XLMRobertaTokenizer(PreTrainedTokenizer):
-    """
-        Adapted from RobertaTokenizer and XLNetTokenizer
-        SentencePiece based tokenizer. Peculiarities:
-
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        eos_token="</s>",
-        sep_token="</s>",
-        cls_token="<s>",
-        unk_token="<unk>",
-        pad_token="<pad>",
-        mask_token="<mask>",
-        **kwargs
-    ):
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
-
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
-
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
-
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
-
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
-
-        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + len(self.fairseq_tokens_to_ids)
-
-    def _tokenize(self, text):
-        return self.sp_model.EncodeAsPieces(text)
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        return self.sp_model.PieceToId(token) + self.fairseq_offset
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
diff --git a/server/transformers/src/transformers/tokenization_xlnet.py b/server/transformers/src/transformers/tokenization_xlnet.py
deleted file mode 100644
index e3ebc7107244f3c5258f7f59c6227023a1317b65..0000000000000000000000000000000000000000
--- a/server/transformers/src/transformers/tokenization_xlnet.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Tokenization classes for XLNet model."""
-
-
-import logging
-import os
-import unicodedata
-from shutil import copyfile
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
-        "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
-    }
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlnet-base-cased": None,
-    "xlnet-large-cased": None,
-}
-
-SPIECE_UNDERLINE = "▁"
-
-# Segments (not really needed)
-SEG_ID_A = 0
-SEG_ID_B = 1
-SEG_ID_CLS = 2
-SEG_ID_SEP = 3
-SEG_ID_PAD = 4
-
-
-class XLNetTokenizer(PreTrainedTokenizer):
-    """
-        SentencePiece based tokenizer. Peculiarities:
-
-            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    padding_side = "left"
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=False,
-        remove_space=True,
-        keep_accents=False,
-        bos_token="<s>",
-        eos_token="</s>",
-        unk_token="<unk>",
-        sep_token="<sep>",
-        pad_token="<pad>",
-        cls_token="<cls>",
-        mask_token="<mask>",
-        additional_special_tokens=["<eop>", "<eod>"],
-        **kwargs
-    ):
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-        self._pad_token_type_id = 3
-
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(vocab_file)
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        try:
-            import sentencepiece as spm
-        except ImportError:
-            logger.warning(
-                "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
-                "pip install sentencepiece"
-            )
-            raise
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def preprocess_text(self, inputs):
-        if self.remove_space:
-            outputs = " ".join(inputs.strip().split())
-        else:
-            outputs = inputs
-        outputs = outputs.replace("``", '"').replace("''", '"')
-
-        if not self.keep_accents:
-            outputs = unicodedata.normalize("NFKD", outputs)
-            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
-        if self.do_lower_case:
-            outputs = outputs.lower()
-
-        return outputs
-
-    def _tokenize(self, text, sample=False):
-        """ Tokenize a string. """
-        text = self.preprocess_text(text)
-
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        new_pieces = []
-        for piece in pieces:
-            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
-                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
-                    if len(cur_pieces[0]) == 1:
-                        cur_pieces = cur_pieces[1:]
-                    else:
-                        cur_pieces[0] = cur_pieces[0][1:]
-                cur_pieces.append(piece[-1])
-                new_pieces.extend(cur_pieces)
-            else:
-                new_pieces.append(piece)
-
-        return new_pieces
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        An XLNet sequence has the following format:
-            single sequence: X <sep> <cls>
-            pair of sequences: A <sep> B <sep> <cls>
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return token_ids_0 + sep + cls
-        return token_ids_0 + sep + token_ids_1 + sep + cls
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
-        return ([0] * len(token_ids_0)) + [1, 1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        An XLNet sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
-        | first sequence    | second sequence     | CLS segment ID
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls_segment_id = [2]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0] + cls_segment_id
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
-
-    def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
diff --git a/server/transformers/templates/adding_a_new_example_script/README.md b/server/transformers/templates/adding_a_new_example_script/README.md
deleted file mode 100644
index 2afca08bf8456375c2bca786ce28a5605ada2b31..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_example_script/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# How to add a new example script in 🤗Transformers
-
-This folder provide a template for adding a new example script implementing a training or inference task with the models in the  🤗Transformers library.
-
-Currently only examples for PyTorch are provided which are adaptations of the library's SQuAD examples which implement single-GPU and distributed training with gradient accumulation and mixed-precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
diff --git a/server/transformers/templates/adding_a_new_example_script/run_xxx.py b/server/transformers/templates/adding_a_new_example_script/run_xxx.py
deleted file mode 100644
index 6de065ce65ce57729f02cf6fc593a028d27b1dae..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_example_script/run_xxx.py
+++ /dev/null
@@ -1,724 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning the library models for task XXX."""
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForQuestionAnswering,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertForQuestionAnswering,
-    DistilBertTokenizer,
-    XLMConfig,
-    XLMForQuestionAnswering,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetForQuestionAnswering,
-    XLNetTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from utils_squad import (
-    RawResult,
-    RawResultExtended,
-    convert_examples_to_features,
-    read_squad_examples,
-    write_predictions,
-    write_predictions_extended,
-)
-
-# The follwing import is the official SQuAD evaluation script (2.0).
-# You can remove it from the dependencies if you are using this script outside of the library
-# We've added it here for automated tests (see examples/test_examples.py file)
-from utils_squad_evaluate import EVAL_OPTS
-from utils_squad_evaluate import main as evaluate_on_squad
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
-}
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def to_list(tensor):
-    return tensor.detach().cpu().tolist()
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {
-                "input_ids": batch[0],
-                "attention_mask": batch[1],
-                "start_positions": batch[3],
-                "end_positions": batch[4],
-            }
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
-
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    all_results = []
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        model.eval()
-        batch = tuple(t.to(args.device) for t in batch)
-        with torch.no_grad():
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]  # XLM don't use segment_ids
-            example_indices = batch[3]
-            if args.model_type in ["xlnet", "xlm"]:
-                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
-            outputs = model(**inputs)
-
-        for i, example_index in enumerate(example_indices):
-            eval_feature = features[example_index.item()]
-            unique_id = int(eval_feature.unique_id)
-            if args.model_type in ["xlnet", "xlm"]:
-                # XLNet uses a more complex post-processing procedure
-                result = RawResultExtended(
-                    unique_id=unique_id,
-                    start_top_log_probs=to_list(outputs[0][i]),
-                    start_top_index=to_list(outputs[1][i]),
-                    end_top_log_probs=to_list(outputs[2][i]),
-                    end_top_index=to_list(outputs[3][i]),
-                    cls_logits=to_list(outputs[4][i]),
-                )
-            else:
-                result = RawResult(
-                    unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i])
-                )
-            all_results.append(result)
-
-    # Compute predictions
-    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
-    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
-    if args.version_2_with_negative:
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
-    else:
-        output_null_log_odds_file = None
-
-    if args.model_type in ["xlnet", "xlm"]:
-        # XLNet uses a more complex post-processing procedure
-        write_predictions_extended(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            args.predict_file,
-            model.config.start_n_top,
-            model.config.end_n_top,
-            args.version_2_with_negative,
-            tokenizer,
-            args.verbose_logging,
-        )
-    else:
-        write_predictions(
-            examples,
-            features,
-            all_results,
-            args.n_best_size,
-            args.max_answer_length,
-            args.do_lower_case,
-            output_prediction_file,
-            output_nbest_file,
-            output_null_log_odds_file,
-            args.verbose_logging,
-            args.version_2_with_negative,
-            args.null_score_diff_threshold,
-        )
-
-    # Evaluate with the official SQuAD script
-    evaluate_options = EVAL_OPTS(
-        data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file
-    )
-    results = evaluate_on_squad(evaluate_options)
-    return results
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset,
-        # and the others will use the cache
-
-    # Load data features from cache or dataset file
-    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(
-        os.path.dirname(input_file),
-        "cached_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", input_file)
-        examples = read_squad_examples(
-            input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative
-        )
-        features = convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset,
-        # and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-    if evaluate:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(
-            all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask
-        )
-    else:
-        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(
-            all_input_ids,
-            all_input_mask,
-            all_segment_ids,
-            all_start_positions,
-            all_end_positions,
-            all_cls_index,
-            all_p_mask,
-        )
-
-    if output_examples:
-        return dataset, examples, features
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json"
-    )
-    parser.add_argument(
-        "--predict_file",
-        default=None,
-        type=str,
-        required=True,
-        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model checkpoints and predictions will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-
-    parser.add_argument(
-        "--version_2_with_negative",
-        action="store_true",
-        help="If true, the SQuAD examples contain some that do not have an answer.",
-    )
-    parser.add_argument(
-        "--null_score_diff_threshold",
-        type=float,
-        default=0.0,
-        help="If null_score - best_non_null is greater than the threshold predict null.",
-    )
-
-    parser.add_argument(
-        "--max_seq_length",
-        default=384,
-        type=int,
-        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-        "longer than this will be truncated, and sequences shorter than this will be padded.",
-    )
-    parser.add_argument(
-        "--doc_stride",
-        default=128,
-        type=int,
-        help="When splitting up a long document into chunks, how much stride to take between chunks.",
-    )
-    parser.add_argument(
-        "--max_query_length",
-        default=64,
-        type=int,
-        help="The maximum number of tokens for the question. Questions longer than this will "
-        "be truncated to this length.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-    parser.add_argument(
-        "--n_best_size",
-        default=20,
-        type=int,
-        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
-    )
-    parser.add_argument(
-        "--max_answer_length",
-        default=30,
-        type=int,
-        help="The maximum length of an answer that can be generated. This is needed because the start "
-        "and end predictions are not conditioned on one another.",
-    )
-    parser.add_argument(
-        "--verbose_logging",
-        action="store_true",
-        help="If true, all of the warnings related to data processing will be printed. "
-        "A number of warnings are expected for a normal SQuAD evaluation.",
-    )
-
-    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will
-        # download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    model = model_class.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will
-        # download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
-    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
-    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
-    if args.fp16:
-        try:
-            import apex
-
-            apex.amp.register_half_function(torch, "einsum")
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Save the trained model and the tokenizer
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model.to(args.device)
-
-    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
-
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        for checkpoint in checkpoints:
-            # Reload the model
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-
-            # Evaluate
-            result = evaluate(args, model, tokenizer, prefix=global_step)
-
-            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
-            results.update(result)
-
-    logger.info("Results: {}".format(results))
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/server/transformers/templates/adding_a_new_example_script/utils_xxx.py b/server/transformers/templates/adding_a_new_example_script/utils_xxx.py
deleted file mode 100644
index b8f8cdf2b962c061722aadaad0a7ae3dab88ce8b..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_example_script/utils_xxx.py
+++ /dev/null
@@ -1,1005 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Load XXX dataset. """
-
-
-import collections
-import json
-import logging
-import math
-
-from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
-
-# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
-from utils_squad_evaluate import find_all_best_thresh_v2, get_raw_scores, make_qid_to_has_ans
-
-
-logger = logging.getLogger(__name__)
-
-
-class SquadExample(object):
-    """
-    A single training/test example for the Squad dataset.
-    For examples without an answer, the start and end position are -1.
-    """
-
-    def __init__(
-        self,
-        qas_id,
-        question_text,
-        doc_tokens,
-        orig_answer_text=None,
-        start_position=None,
-        end_position=None,
-        is_impossible=None,
-    ):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (self.qas_id)
-        s += ", question_text: %s" % (self.question_text)
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.end_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.is_impossible:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(
-        self,
-        unique_id,
-        example_index,
-        doc_span_index,
-        tokens,
-        token_to_orig_map,
-        token_is_max_context,
-        input_ids,
-        input_mask,
-        segment_ids,
-        cls_index,
-        p_mask,
-        paragraph_len,
-        start_position=None,
-        end_position=None,
-        is_impossible=None,
-    ):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.cls_index = cls_index
-        self.p_mask = p_mask
-        self.paragraph_len = paragraph_len
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-
-
-def read_squad_examples(input_file, is_training, version_2_with_negative):
-    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r", encoding="utf-8") as reader:
-        input_data = json.load(reader)["data"]
-
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if version_2_with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError("For training, each question should have exactly 1 answer.")
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position : (end_position + 1)])
-                        cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-
-                example = SquadExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible,
-                )
-                examples.append(example)
-    return examples
-
-
-def convert_examples_to_features(
-    examples,
-    tokenizer,
-    max_seq_length,
-    doc_stride,
-    max_query_length,
-    is_training,
-    cls_token_at_end=False,
-    cls_token="[CLS]",
-    sep_token="[SEP]",
-    pad_token=0,
-    sequence_a_segment_id=0,
-    sequence_b_segment_id=1,
-    cls_token_segment_id=0,
-    pad_token_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    unique_id = 1000000000
-    # cnt_pos, cnt_neg = 0, 0
-    # max_N, max_M = 1024, 1024
-    # f = np.zeros((max_N, max_M), dtype=np.float32)
-
-    features = []
-    for (example_index, example) in enumerate(examples):
-
-        # if example_index % 100 == 0:
-        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
-
-        query_tokens = tokenizer.tokenize(example.question_text)
-
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text
-            )
-
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])  # pylint: disable=invalid-name
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-
-            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
-            # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = []
-
-            # CLS token at the beginning
-            if not cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = 0
-
-            # Query
-            for token in query_tokens:
-                tokens.append(token)
-                segment_ids.append(sequence_a_segment_id)
-                p_mask.append(1)
-
-            # SEP token
-            tokens.append(sep_token)
-            segment_ids.append(sequence_a_segment_id)
-            p_mask.append(1)
-
-            # Paragraph
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(sequence_b_segment_id)
-                p_mask.append(0)
-            paragraph_len = doc_span.length
-
-            # SEP token
-            tokens.append(sep_token)
-            segment_ids.append(sequence_b_segment_id)
-            p_mask.append(1)
-
-            # CLS token at the end
-            if cls_token_at_end:
-                tokens.append(cls_token)
-                segment_ids.append(cls_token_segment_id)
-                p_mask.append(0)
-                cls_index = len(tokens) - 1  # Index of classification token
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            while len(input_ids) < max_seq_length:
-                input_ids.append(pad_token)
-                input_mask.append(0 if mask_padding_with_zero else 1)
-                segment_ids.append(pad_token_segment_id)
-                p_mask.append(1)
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            span_is_impossible = example.is_impossible
-            start_position = None
-            end_position = None
-            if is_training and not span_is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                    span_is_impossible = True
-                else:
-                    doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-
-            if is_training and span_is_impossible:
-                start_position = cls_index
-                end_position = cls_index
-
-            if example_index < 20:
-                logger.info("*** Example ***")
-                logger.info("unique_id: %s" % (unique_id))
-                logger.info("example_index: %s" % (example_index))
-                logger.info("doc_span_index: %s" % (doc_span_index))
-                logger.info("tokens: %s" % " ".join(tokens))
-                logger.info(
-                    "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])
-                )
-                logger.info(
-                    "token_is_max_context: %s"
-                    % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])
-                )
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and span_is_impossible:
-                    logger.info("impossible example")
-                if is_training and not span_is_impossible:
-                    answer_text = " ".join(tokens[start_position : (end_position + 1)])
-                    logger.info("start_position: %d" % (start_position))
-                    logger.info("end_position: %d" % (end_position))
-                    logger.info("answer: %s" % (answer_text))
-
-            features.append(
-                InputFeatures(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    input_ids=input_ids,
-                    input_mask=input_mask,
-                    segment_ids=segment_ids,
-                    cls_index=cls_index,
-                    p_mask=p_mask,
-                    paragraph_len=paragraph_len,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=span_is_impossible,
-                )
-            )
-            unique_id += 1
-
-    return features
-
-
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
-    """Tighten a token span so that it matches the tokenized answer text.
-
-    Every sub-span of `doc_tokens[input_start : input_end + 1]` is compared,
-    after joining its tokens with single spaces, against the space-joined
-    output of `tokenizer.tokenize(orig_answer_text)`. The first exact match is
-    returned as `(start, end)` (both inclusive); when nothing matches, the
-    input span `(input_start, input_end)` is returned unchanged.
-    """
-
-    # SQuAD answers are annotated at character level and are first projected
-    # onto whitespace-separated words. Sub-word tokenization frequently allows
-    # a tighter span than that projection. Example:
-    #
-    #   Question: What year was John Smith born?
-    #   Context: The leader was John Smith (1895-1943).
-    #   Answer: 1895
-    #
-    # The whitespace-level answer is "(1895-1943).", but the tokenized context
-    # is "( 1895 - 1943 ) .", so the span can be narrowed to exactly "1895".
-    #
-    # Narrowing is not always possible:
-    #
-    #   Question: What country is the top exporter of electronics?
-    #   Context: The Japanese electronics industry is the largest in the world.
-    #   Answer: Japan
-    #
-    # Here the annotator picked "Japan", a character sub-span of "Japanese".
-    # The tokenizer keeps "Japanese" in one piece, so no sub-span matches and
-    # the original span is kept. This is rare in SQuAD, but it does occur.
-    wanted = " ".join(tokenizer.tokenize(orig_answer_text))
-
-    # Scan order: left edge moves forward; for each left edge the right edge
-    # starts at the far end and moves backward.
-    left = input_start
-    while left <= input_end:
-        right = input_end
-        while right >= left:
-            candidate = " ".join(doc_tokens[left : (right + 1)])
-            if candidate == wanted:
-                return (left, right)
-            right -= 1
-        left += 1
-
-    return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Tell whether span `cur_span_index` gives `position` its best context.
-
-    Each element of `doc_spans` must expose `start` and `length`. Returns True
-    only when the span at `cur_span_index` is the highest-scoring span among
-    those that contain `position`.
-    """
-
-    # With a sliding window, one token may be covered by several spans, e.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
-    #
-    # 'bought' is scored in both B and C. Only the span offering the "maximum
-    # context" should count, where context is the *minimum* of the number of
-    # tokens to the left and to the right of the token inside the span.
-    #
-    # For 'bought', span C (1 left, 3 right) beats span B (4 left, 0 right).
-    # A small bonus proportional to the span length is added to the score;
-    # on equal scores the earliest span wins because the comparison is strict.
-    top_score = None
-    top_index = None
-    for (index, span) in enumerate(doc_spans):
-        last = span.start + span.length - 1
-        if not (span.start <= position <= last):
-            # The token lies outside this span.
-            continue
-        left_context = position - span.start
-        right_context = last - position
-        candidate = min(left_context, right_context) + 0.01 * span.length
-        if top_score is None or candidate > top_score:
-            top_score, top_index = candidate, index
-
-    return cur_span_index == top_index
-
-
-# Per-feature model output: the feature's unique id plus its start/end logits.
-RawResult = collections.namedtuple(
-    "RawResult",
-    ("unique_id", "start_logits", "end_logits"),
-)
-
-
-def write_predictions(
-    all_examples,
-    all_features,
-    all_results,
-    n_best_size,
-    max_answer_length,
-    do_lower_case,
-    output_prediction_file,
-    output_nbest_file,
-    output_null_log_odds_file,
-    verbose_logging,
-    version_2_with_negative,
-    null_score_diff_threshold,
-):
-    """Write final predictions to the json file and log-odds of null if needed.
-
-    For every example, candidate (start, end) spans are gathered from all of
-    its features, ranked by `start_logit + end_logit`, converted back to text,
-    de-duplicated into an n-best list and scored with a softmax.
-
-    Args:
-        all_examples: examples to predict for; `qas_id` and `doc_tokens` are read.
-        all_features: features of those examples; they are grouped by
-            `example_index` and matched to results through `unique_id`.
-        all_results: model outputs exposing `unique_id`, `start_logits` and
-            `end_logits`.
-        n_best_size: number of top start/end indexes considered per feature, and
-            cap on the number of entries taken from the ranked candidates.
-        max_answer_length: spans longer than this many tokens are discarded.
-        do_lower_case: forwarded to `get_final_text`.
-        output_prediction_file: path of the JSON file mapping `qas_id` to text.
-        output_nbest_file: path of the JSON file holding the n-best lists.
-        output_null_log_odds_file: path of the JSON file holding the null score
-            differences; only written when `version_2_with_negative` is true.
-        verbose_logging: forwarded to `get_final_text`.
-        version_2_with_negative: whether an empty ("") answer may be predicted.
-        null_score_diff_threshold: "" is predicted when the null score minus the
-            best non-null score exceeds this value.
-
-    Returns:
-        An ordered dict mapping each `qas_id` to its predicted answer text.
-    """
-    logger.info("Writing predictions to: %s" % (output_prediction_file))
-    logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    # Group the features by the example they were cut from.
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    # Index the model outputs by the unique id of the feature they belong to.
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
-    )
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min null score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if version_2_with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index],
-                        )
-                    )
-        # The null answer is represented by a (0, 0) span taken from the
-        # feature with the lowest null score.
-        if version_2_with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit,
-                )
-            )
-        # Best candidates (highest summed logits) first.
-        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
-
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"]
-        )
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
-                tok_text = " ".join(tok_tokens)
-
-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
-
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-
-                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
-                # Different spans can map to the same text; keep only the best one.
-                if final_text in seen_predictions:
-                    continue
-
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-
-            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
-        # if we didn't include the empty option in the n-best, include it
-        if version_2_with_negative:
-            if "" not in seen_predictions:
-                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
-
-            # In very rare edge cases we could only have single null prediction.
-            # So we just create a nonce prediction in this case to avoid failure.
-            if len(nbest) == 1:
-                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-        assert len(nbest) >= 1
-
-        # Collect the summed logits of every entry and remember the first
-        # entry whose text is not empty.
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-
-        if not version_2_with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    return all_predictions
-
-
-# Result record for XLNet (and for XLM, which uses the same head): top-k
-# start/end log-probabilities with their indexes, plus the CLS logits.
-_RAW_RESULT_EXTENDED_FIELDS = (
-    "unique_id",
-    "start_top_log_probs",
-    "start_top_index",
-    "end_top_log_probs",
-    "end_top_index",
-    "cls_logits",
-)
-RawResultExtended = collections.namedtuple("RawResultExtended", _RAW_RESULT_EXTENDED_FIELDS)
-
-
-def write_predictions_extended(
-    all_examples,
-    all_features,
-    all_results,
-    n_best_size,
-    max_answer_length,
-    output_prediction_file,
-    output_nbest_file,
-    output_null_log_odds_file,
-    orig_data_file,
-    start_n_top,
-    end_n_top,
-    version_2_with_negative,
-    tokenizer,
-    verbose_logging,
-):
-    """ XLNet write prediction logic (more complex than Bert's).
-        Write final predictions to the json file and log-odds of null if needed.
-
-        Requires utils_squad_evaluate.py
-
-        Args:
-            all_examples: examples to predict for; `qas_id` and `doc_tokens` are read.
-            all_features: features of those examples; grouped by `example_index`
-                and matched to results through `unique_id`.
-            all_results: model outputs exposing the `RawResultExtended` fields.
-            n_best_size: cap on the number of distinct answers kept per example.
-            max_answer_length: spans longer than this many tokens are discarded.
-            output_prediction_file: path of the JSON file mapping `qas_id` to text.
-            output_nbest_file: path of the JSON file holding the n-best lists.
-            output_null_log_odds_file: path of the JSON file holding the null
-                scores; only written when `version_2_with_negative` is true.
-            orig_data_file: JSON file whose "data" entry is handed to the
-                evaluation helpers.
-            start_n_top: number of start candidates read from each result.
-            end_n_top: number of end candidates read per start candidate.
-            version_2_with_negative: controls whether the null scores are written.
-            tokenizer: provides `convert_tokens_to_string` and `do_lower_case`.
-            verbose_logging: forwarded to `get_final_text`.
-
-        Returns:
-            The `out_eval` dict after it has been passed to
-            `find_all_best_thresh_v2` (presumably filled in place by that
-            helper; confirm against utils_squad_evaluate.py).
-    """
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
-    )
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
-    )
-
-    logger.info("Writing predictions to: %s", output_prediction_file)
-    # logger.info("Writing nbest to: %s" % (output_nbest_file))
-
-    # Group the features by the example they were cut from.
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    # Index the model outputs by the unique id of the feature they belong to.
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-
-            cur_null_score = result.cls_logits
-
-            # if we could have irrelevant answers, get the min score of irrelevant
-            score_null = min(score_null, cur_null_score)
-
-            for i in range(start_n_top):
-                for j in range(end_n_top):
-                    start_log_prob = result.start_top_log_probs[i]
-                    start_index = result.start_top_index[i]
-
-                    # End candidates are stored flat: `end_n_top` entries for
-                    # each start candidate, in start-candidate order.
-                    j_index = i * end_n_top + j
-
-                    end_log_prob = result.end_top_log_probs[j_index]
-                    end_index = result.end_top_index[j_index]
-
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= feature.paragraph_len - 1:
-                        continue
-                    if end_index >= feature.paragraph_len - 1:
-                        continue
-
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_log_prob=start_log_prob,
-                            end_log_prob=end_log_prob,
-                        )
-                    )
-
-        # Best candidates (highest summed log-probabilities) first.
-        prelim_predictions = sorted(
-            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
-        )
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-
-            # XLNet un-tokenizer
-            # Let's keep it simple for now and see if we need all this later.
-            #
-            # tok_start_to_orig_index = feature.tok_start_to_orig_index
-            # tok_end_to_orig_index = feature.tok_end_to_orig_index
-            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
-            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
-            # paragraph_text = example.paragraph_text
-            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
-
-            # Previously used Bert untokenizer
-            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
-            orig_doc_start = feature.token_to_orig_map[pred.start_index]
-            orig_doc_end = feature.token_to_orig_map[pred.end_index]
-            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
-            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
-
-            # Clean whitespace
-            tok_text = tok_text.strip()
-            tok_text = " ".join(tok_text.split())
-            orig_text = " ".join(orig_tokens)
-
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
-
-            # Different spans can map to the same text; keep only the best one.
-            if final_text in seen_predictions:
-                continue
-
-            seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
-            )
-
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
-
-        # Collect the summed log-probabilities; the first entry of `nbest` is
-        # taken as the best entry regardless of its text.
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_log_prob + entry.end_log_prob)
-            if not best_non_null_entry:
-                best_non_null_entry = entry
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_log_prob"] = entry.start_log_prob
-            output["end_log_prob"] = entry.end_log_prob
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-        assert best_non_null_entry is not None
-
-        score_diff = score_null
-        scores_diff_json[example.qas_id] = score_diff
-        # note(zhiliny): always predict best_non_null_entry
-        # and the evaluation script will search for the best threshold
-        all_predictions[example.qas_id] = best_non_null_entry.text
-
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-    # Score the predictions against the original dataset file.
-    with open(orig_data_file, "r", encoding="utf-8") as reader:
-        orig_data = json.load(reader)["data"]
-
-    qid_to_has_ans = make_qid_to_has_ans(orig_data)
-    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
-    out_eval = {}
-
-    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
-
-    return out_eval
-
-
-def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
-    """Map a normalized predicted answer back onto the original text.
-
-    Returns the slice of `orig_text` that corresponds to `pred_text`, or
-    `orig_text` itself whenever the alignment heuristic below fails. When
-    `verbose_logging` is true, each failure is reported through `logger.info`.
-    """
-
-    # `orig_text` is the span of the original (whitespace tokenized) text that
-    # corresponds to the predicted token span, but it may carry characters
-    # that do not belong to the prediction. For instance:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # Returning `orig_text` would keep the unwanted "'s"; returning `pred_text`
-    # would return text that has already been normalized by the tokenizer
-    # (lower casing, accent stripping, ...). The desired output is
-    # "Steve Smith", which requires a character-level alignment between the
-    # two strings. The alignment is heuristic and can fail.
-
-    def _strip_spaces(text):
-        # Drop the spaces and record, for every kept character, the position it
-        # had in `text` (key: index without spaces, value: index with spaces).
-        kept = [(pos, char) for (pos, char) in enumerate(text) if char != " "]
-        ns_to_s_map = collections.OrderedDict(
-            (ns_pos, pos) for (ns_pos, (pos, _char)) in enumerate(kept)
-        )
-        ns_text = "".join(char for (_pos, char) in kept)
-        return (ns_text, ns_to_s_map)
-
-    # Tokenize `orig_text` and look for the prediction inside the result.
-    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose_logging:
-            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
-    # With spaces removed, both strings must have the same length; the
-    # characters are then assumed to be aligned one-to-one.
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose_logging:
-            logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
-        return orig_text
-
-    # Inverse map for the tokenized text: index with spaces -> index without.
-    tok_s_to_ns_map = {tok_index: ns_index for (ns_index, tok_index) in tok_ns_to_s_map.items()}
-
-    def _project(tok_position):
-        # Tokenized-text position -> space-free position -> `orig_text` position.
-        # Yields None when either lookup has no entry.
-        if tok_position not in tok_s_to_ns_map:
-            return None
-        ns_position = tok_s_to_ns_map[tok_position]
-        if ns_position not in orig_ns_to_s_map:
-            return None
-        return orig_ns_to_s_map[ns_position]
-
-    orig_start_position = _project(start_position)
-    if orig_start_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map start position")
-        return orig_text
-
-    orig_end_position = _project(end_position)
-    if orig_end_position is None:
-        if verbose_logging:
-            logger.info("Couldn't map end position")
-        return orig_text
-
-    return orig_text[orig_start_position : (orig_end_position + 1)]
-
-
-def _get_best_indexes(logits, n_best_size):
-    """Return the positions of the `n_best_size` largest logits, best first."""
-    ranked = sorted(enumerate(logits), key=lambda pair: pair[1], reverse=True)
-    # Keep the entries whose rank is below the cut-off; fewer are returned when
-    # `logits` is shorter than `n_best_size`.
-    return [index for (rank, (index, _score)) in enumerate(ranked) if rank < n_best_size]
-
-
-def _compute_softmax(scores):
-    """Turn a list of raw scores into softmax probabilities.
-
-    Returns an empty list for empty input. The largest score is subtracted
-    from every score before exponentiation.
-    """
-    if not scores:
-        return []
-
-    peak = max(scores)
-    weights = [math.exp(value - peak) for value in scores]
-
-    # Accumulate left to right, starting from 0.0.
-    normalizer = 0.0
-    for weight in weights:
-        normalizer += weight
-
-    return [weight / normalizer for weight in weights]
diff --git a/server/transformers/templates/adding_a_new_model/README.md b/server/transformers/templates/adding_a_new_model/README.md
deleted file mode 100644
index 5397ca4c789817bbb244bcfbd7679adc9381f8d2..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/README.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# How to add a new model in 🤗Transformers
-
-This folder describes the process to add a new model in 🤗Transformers and provide templates for the required files.
-
-The library is designed to incorporate a variety of models and code bases. As such the process for adding a new model usually mostly consists in copy-pasting to relevant original code in the various sections of the templates included in the present repository.
-
-One important point though is that the library has the following goals impacting the way models are incorporated:
-
-- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often have to be slightly adapted to allow for running in the python interpreter.
-- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
-
-For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
-
-# Typical workflow for including a model
-
-Here an overview of the general workflow: 
-
-- [ ] add model/configuration/tokenization classes
-- [ ] add conversion scripts
-- [ ] add tests
-- [ ] finalize
-
-Let's detail what should be done at each step
-
-## Adding model/configuration/tokenization classes
-
-Here is the workflow for adding model/configuration/tokenization classes:
-
-- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
-- [ ] edit the files to replace `XXX` (with various casing) with your model name
-- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
-- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
-- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
-
-# Adding conversion scripts
-
-Here is the workflow for the conversion scripts:
-
-- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
-- [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.
-
-# Adding tests:
-
-Here is the workflow for the adding tests:
-
-- [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name,
-- [ ] edit the tests files to replace `XXX` (with various casing) with your model name
-- [ ] edit the tests code as needed
-
-# Final steps
-
-You can then finish the addition step by adding imports for your classes in the common files:
-
-- [ ] add import for all the relevant classes in `__init__.py`
-- [ ] add your configuration in `configuration_auto.py`
-- [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
-- [ ] add your tokenizer in `tokenization_auto.py`
-- [ ] add your models and tokenizer to `pipeline.py`
-- [ ] add a link to your conversion script in the main conversion utility (in `commands/convert.py`)
-- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
-- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
-- [ ] upload the pretrained weigths, configurations and vocabulary files.
diff --git a/server/transformers/templates/adding_a_new_model/configuration_xxx.py b/server/transformers/templates/adding_a_new_model/configuration_xxx.py
deleted file mode 100644
index d23bce43d2f43bf6cda25feea3197a9ddfc56f01..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/configuration_xxx.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# coding=utf-8
-# Copyright 2010, XXX authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" XXX model configuration """
-
-
-import logging
-
-from .configuration_utils import PretrainedConfig
-
-
-logger = logging.getLogger(__name__)
-
-XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
-    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
-}
-
-
-class XxxConfig(PretrainedConfig):
-    r"""
-        :class:`~transformers.XxxConfig` is the configuration class to store the configuration of a
-        `XxxModel`.
-
-
-        Arguments:
-            vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
-            hidden_size: Size of the encoder layers and the pooler layer.
-            num_hidden_layers: Number of hidden layers in the Transformer encoder.
-            num_attention_heads: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `XxxModel`.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            layer_norm_eps: The epsilon used by LayerNorm.
-    """
-    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
-    model_type = "xxx"
-
-    def __init__(
-        self,
-        vocab_size=50257,
-        n_positions=1024,
-        n_ctx=1024,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.n_ctx = n_ctx
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
diff --git a/server/transformers/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/server/transformers/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
deleted file mode 100755
index b57d3bbdcaeacce796833750f259f5f809beca58..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert XXX checkpoint."""
-
-
-import argparse
-import logging
-
-import torch
-
-from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
-
-
-logging.basicConfig(level=logging.INFO)
-
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
-    # Initialise PyTorch model
-    config = XxxConfig.from_json_file(config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = XxxForPreTraining(config)
-
-    # Load weights from tf checkpoint
-    load_tf_weights_in_xxx(model, config, tf_checkpoint_path)
-
-    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
-    )
-    parser.add_argument(
-        "--config_file",
-        default=None,
-        type=str,
-        required=True,
-        help="The config json file corresponding to the pre-trained model. \n"
-        "This specifies the model architecture.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
diff --git a/server/transformers/templates/adding_a_new_model/modeling_tf_xxx.py b/server/transformers/templates/adding_a_new_model/modeling_tf_xxx.py
deleted file mode 100644
index 4e3791e481d9900bbe5d6454a7483440de642885..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/modeling_tf_xxx.py
+++ /dev/null
@@ -1,532 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" TF 2.0 XXX model. """
-
-####################################################
-# In this template, replace all the XXX (various casings) with your model name
-####################################################
-
-
-import logging
-
-import tensorflow as tf
-
-from .configuration_xxx import XxxConfig
-from .file_utils import add_start_docstrings
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
-
-
-logger = logging.getLogger(__name__)
-
-####################################################
-# This dict contrains shortcut names and associated url
-# for the pretrained weights provided with the models
-####################################################
-TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
-    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
-}
-
-
-####################################################
-# TF 2.0 Models are constructed using Keras imperative API by sub-classing
-# - tf.keras.layers.Layer for the layers and
-# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
-####################################################
-
-####################################################
-# Here is an example of typical layer in a TF 2.0 model of the library
-# The classes are usually identical to the PyTorch ones and prefixed with 'TF'.
-#
-# Note that class __init__ parameters includes **kwargs (send to 'super').
-# This let us have a control on class scope and variable names:
-# More precisely, we set the names of the class attributes (lower level layers) to
-# to the equivalent attributes names in the PyTorch model so we can have equivalent
-# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other.
-#
-# See the conversion methods in modeling_tf_pytorch_utils.py for more details
-####################################################
-
-TFXxxAttention = tf.keras.layers.Layer
-
-TFXxxIntermediate = tf.keras.layers.Layer
-
-TFXxxOutput = tf.keras.layers.Layer
-
-
-class TFXxxLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.attention = TFXxxAttention(config, name="attention")
-        self.intermediate = TFXxxIntermediate(config, name="intermediate")
-        self.transformer_output = TFXxxOutput(config, name="output")
-
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
-
-        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.transformer_output([intermediate_output, attention_output], training=training)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
-####################################################
-# The full model without a specific pretrained or finetuning head is
-# provided as a tf.keras.layers.Layer usually called "TFXxxMainLayer"
-####################################################
-class TFXxxMainLayer(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
-
-    def _prune_heads(self, heads_to_prune):
-        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
-
-    def call(
-        self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False
-    ):
-        # We allow three types of multi-inputs:
-        # - traditional keyword arguments in the call method
-        # - all the arguments provided as a dict in the first positional argument of call
-        # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
-        # The last two options are useful to use the tf.keras fit() method.
-
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            assert len(inputs) <= 5, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", attention_mask)
-            token_type_ids = inputs.get("token_type_ids", token_type_ids)
-            position_ids = inputs.get("position_ids", position_ids)
-            head_mask = inputs.get("head_mask", head_mask)
-            assert len(inputs) <= 5, "Too many inputs."
-        else:
-            input_ids = inputs
-
-        if attention_mask is None:
-            attention_mask = tf.fill(shape_list(input_ids), 1)
-        if token_type_ids is None:
-            token_type_ids = tf.fill(shape_list(input_ids), 0)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-
-        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
-
-        ##################################
-        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
-        sequence_output = encoder_outputs[0]
-        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-
-        return outputs  # sequence_output, (hidden_states), (attentions)
-
-
-####################################################
-# TFXxxPreTrainedModel is a sub-class of tf.keras.Model
-# which take care of loading and saving pretrained weights
-# and various common utilities.
-# Here you just need to specify a few (self-explanatory)
-# pointers for your model.
-####################################################
-class TFXxxPreTrainedModel(TFPreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = XxxConfig
-    pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-    base_model_prefix = "transformer"
-
-
-XXX_START_DOCSTRING = r"""    The XXX model was proposed in
-    `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
-    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
-    pre-trained using a combination of masked language modeling objective and next sentence prediction
-    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
-
-    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
-    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
-
-    .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
-        https://arxiv.org/abs/1810.04805
-
-    .. _`tf.keras.Model`:
-        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
-
-    Note on the model inputs:
-        TF 2.0 models accepts two formats as inputs:
-
-            - having all inputs as keyword arguments (like PyTorch models), or
-            - having all inputs as a list, tuple or dict in the first positional arguments.
-
-        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
-
-        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
-
-        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
-        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
-        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
-            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
-
-    Parameters:
-        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XXX_INPUTS_DOCSTRING = r"""
-    Inputs:
-        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
-            (a) For sequence pairs:
-
-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
-                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
-
-            (b) For single sequences:
-
-                ``tokens:         [CLS] the dog is hairy . [SEP]``
-
-                ``token_type_ids:   0   0   0   0  0     0   0``
-
-            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
-            the right rather than the left.
-
-            Indices can be obtained using :class:`transformers.XxxTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
-            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class TFXxxModel(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Xxx pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxModel
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxModel.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.transformer = TFXxxMainLayer(config, name="transformer")
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
-
-
-TFXxxMLMHead = tf.keras.layers.Layer
-
-
-@add_start_docstrings(
-    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
-)
-class TFXxxForMaskedLM(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForMaskedLM
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-
-        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-
-        return outputs  # prediction_scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForSequenceClassification
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class TFXxxForTokenClassification(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
-            Classification scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForTokenClassification
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
-        )
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
-            Span-start scores (before SoftMax).
-        **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
-            Span-end scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForQuestionAnswering
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super().__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name="transformer")
-        self.qa_outputs = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
-        )
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-
-        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/templates/adding_a_new_model/modeling_xxx.py b/server/transformers/templates/adding_a_new_model/modeling_xxx.py
deleted file mode 100644
index f9f4daa9506fc9731b03b326444d63aa45a27be5..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/modeling_xxx.py
+++ /dev/null
@@ -1,749 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch XXX model. """
-
-####################################################
-# In this template, replace all the XXX (various casings) with your model name
-####################################################
-
-
-import logging
-import os
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from .configuration_xxx import XxxConfig
-from .file_utils import add_start_docstrings
-from .modeling_utils import PreTrainedModel
-
-
-logger = logging.getLogger(__name__)
-
-####################################################
-# This dict contrains shortcut names and associated url
-# for the pretrained weights provided with the models
-####################################################
-XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
-    "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
-}
-
-
-####################################################
-# This is a conversion method from TF 1.0 to PyTorch
-# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
-####################################################
-def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model.
-    """
-    try:
-        import re
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        logger.error(
-            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions."
-        )
-        raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        logger.info("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split("/")
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            logger.info("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
-                scope_names = re.split(r"_(\d+)", m_name)
-            else:
-                scope_names = [m_name]
-            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-                pointer = getattr(pointer, "bias")
-            elif scope_names[0] == "output_weights":
-                pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
-            else:
-                try:
-                    pointer = getattr(pointer, scope_names[0])
-                except AttributeError:
-                    logger.info("Skipping {}".format("/".join(name)))
-                    continue
-            if len(scope_names) >= 2:
-                num = int(scope_names[1])
-                pointer = pointer[num]
-        if m_name[-11:] == "_embeddings":
-            pointer = getattr(pointer, "weight")
-        elif m_name == "kernel":
-            array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-####################################################
-# PyTorch Models are constructed by sub-classing
-# - torch.nn.Module for the layers and
-# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
-####################################################
-
-####################################################
-# Here is an example of typical layer in a PyTorch model of the library
-# The classes are usually identical to the TF 2.0 ones without the 'TF' prefix.
-#
-# See the conversion methods in modeling_tf_pytorch_utils.py for more details
-####################################################
-
-XxxAttention = nn.Module
-
-XxxIntermediate = nn.Module
-
-XxxOutput = nn.Module
-
-
-class XxxLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.attention = XxxAttention(config)
-        self.intermediate = XxxIntermediate(config)
-        self.output = XxxOutput(config)
-
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
-####################################################
-# PreTrainedModel is a sub-class of torch.nn.Module
-# which take care of loading and saving pretrained weights
-# and various common utilities.
-#
-# Here you just need to specify a few (self-explanatory)
-# pointers for your model and the weights initialization
-# method if its not fully covered by PreTrainedModel's default method
-####################################################
-
-XxxLayerNorm = torch.nn.LayerNorm
-
-XxxEmbeddings = nn.Module
-
-XxxEncoder = nn.Module
-
-XxxPooler = nn.Module
-
-
-class XxxPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = XxxConfig
-    pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_xxx
-    base_model_prefix = "transformer"
-
-    def _init_weights(self, module):
-        """ Initialize the weights """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, XxxLayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-
-XXX_START_DOCSTRING = r"""    The XXX model was proposed in
-    `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
-    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
-    pre-trained using a combination of masked language modeling objective and next sentence prediction
-    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
-
-    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
-    refer to the PyTorch documentation for all matter related to general usage and behavior.
-
-    .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
-        https://arxiv.org/abs/1810.04805
-
-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
-
-    Parameters:
-        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-"""
-
-XXX_INPUTS_DOCSTRING = r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
-            (a) For sequence pairs:
-
-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
-                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
-
-            (b) For single sequences:
-
-                ``tokens:         [CLS] the dog is hairy . [SEP]``
-
-                ``token_type_ids:   0   0   0   0  0     0   0``
-
-            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
-            the right rather than the left.
-
-            Indices can be obtained using :class:`transformers.XxxTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
-            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-"""
-
-
-@add_start_docstrings(
-    "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class XxxModel(XxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Xxx pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = XxxModel.from_pretrained('xxx-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.embeddings = XxxEmbeddings(config)
-        self.encoder = XxxEncoder(config)
-        self.pooler = XxxPooler(config)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, new_embeddings):
-        self.embeddings.word_embeddings = new_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-    ):
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = (
-                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
-                )  # We can specify head_mask for each layer
-            head_mask = head_mask.to(
-                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        ##################################
-        # Replace this with your model code
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
-        )
-        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
-        sequence_output = encoder_outputs[0]
-        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-
-        return outputs  # sequence_output, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
-)
-class XxxForMaskedLM(XxxPreTrainedModel):
-    r"""
-        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = XxxForMaskedLM.from_pretrained('xxx-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.transformer = XxxModel(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
-
-        self.init_weights()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        masked_lm_labels=None,
-    ):
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output)
-
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-        if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-
-        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class XxxForSequenceClassification(XxxPreTrainedModel):
-    r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
-        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = XxxForSequenceClassification.from_pretrained('xxx-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XxxModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-
-        self.init_weights()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class XxxForTokenClassification(XxxPreTrainedModel):
-    r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification loss.
-        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
-            Classification scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = XxxForTokenClassification.from_pretrained('xxx-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XxxModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-    ):
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING,
-    XXX_INPUTS_DOCSTRING,
-)
-class XxxForQuestionAnswering(XxxPreTrainedModel):
-    r"""
-        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-start scores (before SoftMax).
-        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-end scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = XxxForQuestionAnswering.from_pretrained('xxx-large-uncased-whole-word-masking-finetuned-squad')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
-        input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
-        # a nice puppet
-
-
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.transformer = XxxModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-    ):
-
-        outputs = self.transformer(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/server/transformers/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py b/server/transformers/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py
deleted file mode 100644
index 3e12b3f745997f149d7c635e07670ecf234e05d9..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import XxxConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    from transformers.modeling_tf_xxx import (
-        TFXxxModel,
-        TFXxxForMaskedLM,
-        TFXxxForSequenceClassification,
-        TFXxxForTokenClassification,
-        TFXxxForQuestionAnswering,
-    )
-
-
-@require_tf
-class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            TFXxxModel,
-            TFXxxForMaskedLM,
-            TFXxxForQuestionAnswering,
-            TFXxxForSequenceClassification,
-            TFXxxForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-
-    class TFXxxModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = XxxConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_xxx_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFXxxModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output, pooled_output = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            sequence_output, pooled_output = model(inputs)
-
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-                "pooled_output": pooled_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
-
-        def create_and_check_xxx_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFXxxForMaskedLM(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (prediction_scores,) = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_xxx_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFXxxForSequenceClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
-
-        def create_and_check_xxx_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFXxxForTokenClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
-            )
-
-        def create_and_check_xxx_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFXxxForQuestionAnswering(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            start_logits, end_logits = model(inputs)
-            result = {
-                "start_logits": start_logits.numpy(),
-                "end_logits": end_logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFXxxModelTest.TFXxxModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xxx_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in ["xxx-base-uncased"]:
-            model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/templates/adding_a_new_model/tests/test_modeling_xxx.py b/server/transformers/templates/adding_a_new_model/tests/test_modeling_xxx.py
deleted file mode 100644
index 281a9226fc25490aba3030fcf18dc8d417b4958a..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/tests/test_modeling_xxx.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        XxxConfig,
-        XxxModel,
-        XxxForMaskedLM,
-        XxxForQuestionAnswering,
-        XxxForSequenceClassification,
-        XxxForTokenClassification,
-    )
-    from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class XxxModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification)
-        if is_torch_available()
-        else ()
-    )
-
-    class XxxModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = XxxConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_xxx_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = XxxModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_xxx_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = XxxForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_xxx_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = XxxForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_xxx_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = XxxForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_xxx_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = XxxForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = XxxModelTest.XxxModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xxx_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/templates/adding_a_new_model/tests/test_tokenization_xxx.py b/server/transformers/templates/adding_a_new_model/tests/test_tokenization_xxx.py
deleted file mode 100644
index 1a24f76b0fb1327c41be50117db59b8c572ef74f..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/tests/test_tokenization_xxx.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = XxxTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "UNwant\u00E9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
diff --git a/server/transformers/templates/adding_a_new_model/tokenization_xxx.py b/server/transformers/templates/adding_a_new_model/tokenization_xxx.py
deleted file mode 100644
index 667a130a9bf5a7d99c8bae4ec52b739d39377092..0000000000000000000000000000000000000000
--- a/server/transformers/templates/adding_a_new_model/tokenization_xxx.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Tokenization class for model XXX."""
-
-
-import collections
-import logging
-import os
-
-from .tokenization_utils import PreTrainedTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-####################################################
-# In this template, replace all the XXX (various casings) with your model name
-####################################################
-
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to file names for serializing Tokenizer instances
-####################################################
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-####################################################
-# Mapping from the keyword arguments names of Tokenizer `__init__`
-# to pretrained vocabulary URL for all the model shortcut names.
-####################################################
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
-        "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
-    }
-}
-
-####################################################
-# Mapping from model shortcut names to max length of inputs
-####################################################
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xxx-base-uncased": 512,
-    "xxx-large-uncased": 512,
-}
-
-####################################################
-# Mapping from model shortcut names to a dictionary of additional
-# keyword arguments for Tokenizer `__init__`.
-# To be used for checkpoint specific configurations.
-####################################################
-PRETRAINED_INIT_CONFIGURATION = {
-    "xxx-base-uncased": {"do_lower_case": True},
-    "xxx-large-uncased": {"do_lower_case": True},
-}
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-class XxxTokenizer(PreTrainedTokenizer):
-    r"""
-    Constructs a XxxTokenizer.
-    :class:`~transformers.XxxTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
-
-    Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs
-    ):
-        """Constructs a XxxTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-        """
-        super().__init__(
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
-
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
-            )
-        self.vocab = load_vocab(vocab_file)
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def _tokenize(self, text):
-        """ Take as input a string and return a list of strings (tokens) for words/sub-words
-        """
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                    split_tokens.append(sub_token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """ Converts a sequence of tokens (string) in a single string. """
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
-
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
-        index = 0
-        if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
-        else:
-            vocab_file = vocab_path
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
diff --git a/server/transformers/tests/__init__.py b/server/transformers/tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/server/transformers/tests/fixtures/dummy-config.json b/server/transformers/tests/fixtures/dummy-config.json
deleted file mode 100644
index e388bdf71151db7c014ae6e0174dd07c1a6acbee..0000000000000000000000000000000000000000
--- a/server/transformers/tests/fixtures/dummy-config.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "model_type": "roberta"
-}
\ No newline at end of file
diff --git a/server/transformers/tests/fixtures/empty.txt b/server/transformers/tests/fixtures/empty.txt
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/server/transformers/tests/fixtures/input.txt b/server/transformers/tests/fixtures/input.txt
deleted file mode 100644
index d1e3f410d07833e4c5c233ffd54f8d2b54ebb7cf..0000000000000000000000000000000000000000
--- a/server/transformers/tests/fixtures/input.txt
+++ /dev/null
@@ -1 +0,0 @@
-Who was Jim Henson ? ||| Jim Henson was a puppeteer
diff --git a/server/transformers/tests/fixtures/sample_text.txt b/server/transformers/tests/fixtures/sample_text.txt
deleted file mode 100644
index a42812060c576bae870eb29b1ac083fda0d239d3..0000000000000000000000000000000000000000
--- a/server/transformers/tests/fixtures/sample_text.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
-Text should be one-sentence-per-line, with empty lines between documents.
-This sample text is public domain and was randomly selected from Project Guttenberg.
-
-The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
-Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
-Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
-"Cass" Beard had risen early that morning, but not with a view to discovery.
-A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
-The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
-This was nearly opposite.
-Mr. Cassius crossed the highway, and stopped suddenly.
-Something glittered in the nearest red pool before him.
-Gold, surely!
-But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
-Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
-Like most of his fellow gold-seekers, Cass was superstitious.
-
-The fountain of classic wisdom, Hypatia herself.
-As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
-From my youth I felt in me a soul above the matter-entangled herd.
-She revealed to me the glorious fact, that I am a spark of Divinity itself.
-A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
-There is a philosophic pleasure in opening one's treasures to the modest young.
-Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
-Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
-but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
-Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
-His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
-while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
-At last they reached the quay at the opposite end of the street;
-and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
-He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
diff --git a/server/transformers/tests/fixtures/spiece.model b/server/transformers/tests/fixtures/spiece.model
deleted file mode 100644
index c91b8acfa56ccfc80e1cdd854ddcaf9b6c44ab2a..0000000000000000000000000000000000000000
Binary files a/server/transformers/tests/fixtures/spiece.model and /dev/null differ
diff --git a/server/transformers/tests/fixtures/test_sentencepiece.model b/server/transformers/tests/fixtures/test_sentencepiece.model
deleted file mode 100644
index 376dda73010c6f93acfa3b974bea81a9ac9e1740..0000000000000000000000000000000000000000
Binary files a/server/transformers/tests/fixtures/test_sentencepiece.model and /dev/null differ
diff --git a/server/transformers/tests/test_configuration_auto.py b/server/transformers/tests/test_configuration_auto.py
deleted file mode 100644
index 5262be2e7cccd5ee1143dc0388e8ea0ff0eedb11..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_configuration_auto.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from transformers.configuration_auto import CONFIG_MAPPING, AutoConfig
-from transformers.configuration_bert import BertConfig
-from transformers.configuration_roberta import RobertaConfig
-
-from .utils import DUMMY_UNKWOWN_IDENTIFIER
-
-
-SAMPLE_ROBERTA_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy-config.json")
-
-
-class AutoConfigTest(unittest.TestCase):
-    def test_config_from_model_shortcut(self):
-        config = AutoConfig.from_pretrained("bert-base-uncased")
-        self.assertIsInstance(config, BertConfig)
-
-    def test_config_model_type_from_local_file(self):
-        config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG)
-        self.assertIsInstance(config, RobertaConfig)
-
-    def test_config_model_type_from_model_identifier(self):
-        config = AutoConfig.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
-        self.assertIsInstance(config, RobertaConfig)
-
-    def test_config_for_model_str(self):
-        config = AutoConfig.for_model("roberta")
-        self.assertIsInstance(config, RobertaConfig)
-
-    def test_pattern_matching_fallback(self):
-        """
-        In cases where config.json doesn't include a model_type,
-        perform a few safety checks on the config mapping's order.
-        """
-        # no key string should be included in a later key string (typical failure case)
-        keys = list(CONFIG_MAPPING.keys())
-        for i, key in enumerate(keys):
-            self.assertFalse(any(key in later_key for later_key in keys[i + 1 :]))
diff --git a/server/transformers/tests/test_configuration_common.py b/server/transformers/tests/test_configuration_common.py
deleted file mode 100644
index 471f0f012d549ae17bf4cefd12d0d91ea230857e..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_configuration_common.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import tempfile
-
-
-class ConfigTester(object):
-    def __init__(self, parent, config_class=None, **kwargs):
-        self.parent = parent
-        self.config_class = config_class
-        self.inputs_dict = kwargs
-
-    def create_and_test_config_common_properties(self):
-        config = self.config_class(**self.inputs_dict)
-        self.parent.assertTrue(hasattr(config, "vocab_size"))
-        self.parent.assertTrue(hasattr(config, "hidden_size"))
-        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
-        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
-
-    def create_and_test_config_to_json_string(self):
-        config = self.config_class(**self.inputs_dict)
-        obj = json.loads(config.to_json_string())
-        for key, value in self.inputs_dict.items():
-            self.parent.assertEqual(obj[key], value)
-
-    def create_and_test_config_to_json_file(self):
-        config_first = self.config_class(**self.inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            json_file_path = os.path.join(tmpdirname, "config.json")
-            config_first.to_json_file(json_file_path)
-            config_second = self.config_class.from_json_file(json_file_path)
-
-        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    def create_and_test_config_from_and_save_pretrained(self):
-        config_first = self.config_class(**self.inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            config_first.save_pretrained(tmpdirname)
-            config_second = self.config_class.from_pretrained(tmpdirname)
-
-        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    def run_common_tests(self):
-        self.create_and_test_config_common_properties()
-        self.create_and_test_config_to_json_string()
-        self.create_and_test_config_to_json_file()
-        self.create_and_test_config_from_and_save_pretrained()
diff --git a/server/transformers/tests/test_doc_samples.py b/server/transformers/tests/test_doc_samples.py
deleted file mode 100644
index c97af35200ac4b38875d4be4e33b379221b92b99..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_doc_samples.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-from typing import List, Union
-
-from .utils import require_tf, require_torch, slow
-
-
-def get_examples_from_file(file):
-    examples = []
-    example = []
-    example_mode = False
-    example_indentation = None
-    for i, line in enumerate(file):
-        if example_mode:
-            current_indentation = len(line) - len(line.strip()) - 1
-
-            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
-            empty_line = example_indentation == 0 and len(line) == 1
-
-            # If we're back to the example indentation or if it's the end of the docstring.
-            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
-                # Exit the example mode and add the example to the examples list
-                example_mode = False
-                example_indentation = None
-                examples.append(example)
-                example = []
-            else:
-                # If line is not empty, add it to the current example
-                if line != "\n":
-                    example.append(line[example_indentation + 4 : -1])
-
-        # Detect the example from '::' or 'example::'
-        if "example::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("example::")
-        elif "examples::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("examples::")
-        # elif "::" in line.lower() and len(line.strip()) == 2:
-        #     example_mode = True
-        #     example_indentation = line.lower().find("::")
-
-    examples = ["\n".join(example) for example in examples]
-    examples = [example for example in examples if "not runnable" not in example.lower()]
-
-    return examples
-
-
-@require_torch
-@require_tf
-@slow
-class TestCodeExamples(unittest.TestCase):
-    def analyze_directory(
-        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
-    ):
-        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
-
-        if identifier is not None:
-            files = [file for file in files if identifier in file]
-
-        if ignore_files is not None:
-            files = [file for file in files if file not in ignore_files]
-
-        for file in files:
-            # Open all files
-            with open(os.path.join(directory, file)) as f:
-                # Retrieve examples
-                examples = get_examples_from_file(f)
-                joined_examples = []
-
-                def execute_example(code_example):
-                    exec(code_example, {})
-
-                # Some examples are the continuation of others.
-                if len(examples) > 0:
-                    joined_examples.append(examples[0])
-                    joined_examples_index = 0
-                    for example in examples[1:]:
-                        # If they contain this line, then they're a continuation of the previous script
-                        if "# Continuation of the previous script" in example:
-                            joined_examples[joined_examples_index] += "\n" + example
-                        # If not, create a new example and increment the index
-                        else:
-                            joined_examples.append(example)
-                            joined_examples_index += 1
-
-                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
-
-                # Execute sub tests with every example.
-                for index, code_example in enumerate(joined_examples):
-                    with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
-                        execute_example(code_example)
-
-    def test_configuration_examples(self):
-        transformers_directory = "src/transformers"
-        configuration_files = "configuration"
-        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
-        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
-
-    def test_main_doc_examples(self):
-        doc_directory = "docs/source"
-        self.analyze_directory(doc_directory)
-
-    def test_modeling_examples(self):
-        transformers_directory = "src/transformers"
-        modeling_files = "modeling"
-        ignore_files = [
-            "modeling_auto.py",
-            "modeling_t5.py",
-            "modeling_tf_auto.py",
-            "modeling_utils.py",
-            "modeling_tf_t5.py",
-        ]
-        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
diff --git a/server/transformers/tests/test_hf_api.py b/server/transformers/tests/test_hf_api.py
deleted file mode 100644
index c791390959cd3599d148fec0a13205591decda28..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_hf_api.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# coding=utf-8
-# Copyright 2019-present, the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import time
-import unittest
-
-import requests
-from requests.exceptions import HTTPError
-
-from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
-
-
-USER = "__DUMMY_TRANSFORMERS_USER__"
-PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILES = [
-    (
-        "Test-{}.txt".format(int(time.time())),
-        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"),
-    ),
-    (
-        "yoyo {}.txt".format(int(time.time())),  # space is intentional
-        os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"),
-    ),
-]
-
-
-class HfApiCommonTest(unittest.TestCase):
-    _api = HfApi(endpoint="https://moon-staging.huggingface.co")
-
-
-class HfApiLoginTest(HfApiCommonTest):
-    def test_login_invalid(self):
-        with self.assertRaises(HTTPError):
-            self._api.login(username=USER, password="fake")
-
-    def test_login_valid(self):
-        token = self._api.login(username=USER, password=PASS)
-        self.assertIsInstance(token, str)
-
-
-class HfApiEndpointsTest(HfApiCommonTest):
-    @classmethod
-    def setUpClass(cls):
-        """
-        Share this valid token in all tests below.
-        """
-        cls._token = cls._api.login(username=USER, password=PASS)
-
-    @classmethod
-    def tearDownClass(cls):
-        for FILE_KEY, FILE_PATH in FILES:
-            cls._api.delete_obj(token=cls._token, filename=FILE_KEY)
-
-    def test_whoami(self):
-        user = self._api.whoami(token=self._token)
-        self.assertEqual(user, USER)
-
-    def test_presign(self):
-        for FILE_KEY, FILE_PATH in FILES:
-            urls = self._api.presign(token=self._token, filename=FILE_KEY)
-            self.assertIsInstance(urls, PresignedUrl)
-            self.assertEqual(urls.type, "text/plain")
-
-    def test_presign_and_upload(self):
-        for FILE_KEY, FILE_PATH in FILES:
-            access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
-            self.assertIsInstance(access_url, str)
-            with open(FILE_PATH, "r") as f:
-                body = f.read()
-            r = requests.get(access_url)
-            self.assertEqual(r.text, body)
-
-    def test_list_objs(self):
-        objs = self._api.list_objs(token=self._token)
-        self.assertIsInstance(objs, list)
-        if len(objs) > 0:
-            o = objs[-1]
-            self.assertIsInstance(o, S3Obj)
-
-
-class HfFolderTest(unittest.TestCase):
-    def test_token_workflow(self):
-        """
-        Test the whole token save/get/delete workflow,
-        with the desired behavior with respect to non-existent tokens.
-        """
-        token = "token-{}".format(int(time.time()))
-        HfFolder.save_token(token)
-        self.assertEqual(HfFolder.get_token(), token)
-        HfFolder.delete_token()
-        HfFolder.delete_token()
-        # ^^ not an error, we test that the
-        # second call does not fail.
-        self.assertEqual(HfFolder.get_token(), None)
diff --git a/server/transformers/tests/test_model_card.py b/server/transformers/tests/test_model_card.py
deleted file mode 100644
index 1004642a92a2a6253da5cc91a05ac5c3545ffed9..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_model_card.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import tempfile
-import unittest
-
-from transformers.modelcard import ModelCard
-
-
-class ModelCardTester(unittest.TestCase):
-    def setUp(self):
-        self.inputs_dict = {
-            "model_details": {
-                "Organization": "testing",
-                "Model date": "today",
-                "Model version": "v2.1, Developed by Test Corp in 2019.",
-                "Architecture": "Convolutional Neural Network.",
-            },
-            "metrics": "BLEU and ROUGE-1",
-            "evaluation_data": {
-                "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1"},
-                "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf",
-            },
-            "training_data": {
-                "Dataset": "English Wikipedia dump dated 2018-12-01",
-                "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf",
-            },
-            "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76},
-        }
-
-    def test_model_card_common_properties(self):
-        modelcard = ModelCard.from_dict(self.inputs_dict)
-        self.assertTrue(hasattr(modelcard, "model_details"))
-        self.assertTrue(hasattr(modelcard, "intended_use"))
-        self.assertTrue(hasattr(modelcard, "factors"))
-        self.assertTrue(hasattr(modelcard, "metrics"))
-        self.assertTrue(hasattr(modelcard, "evaluation_data"))
-        self.assertTrue(hasattr(modelcard, "training_data"))
-        self.assertTrue(hasattr(modelcard, "quantitative_analyses"))
-        self.assertTrue(hasattr(modelcard, "ethical_considerations"))
-        self.assertTrue(hasattr(modelcard, "caveats_and_recommendations"))
-
-    def test_model_card_to_json_string(self):
-        modelcard = ModelCard.from_dict(self.inputs_dict)
-        obj = json.loads(modelcard.to_json_string())
-        for key, value in self.inputs_dict.items():
-            self.assertEqual(obj[key], value)
-
-    def test_model_card_to_json_file(self):
-        model_card_first = ModelCard.from_dict(self.inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            filename = os.path.join(tmpdirname, "modelcard.json")
-            model_card_first.to_json_file(filename)
-            model_card_second = ModelCard.from_json_file(filename)
-
-        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
-
-    def test_model_card_from_and_save_pretrained(self):
-        model_card_first = ModelCard.from_dict(self.inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model_card_first.save_pretrained(tmpdirname)
-            model_card_second = ModelCard.from_pretrained(tmpdirname)
-
-        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
diff --git a/server/transformers/tests/test_modeling_albert.py b/server/transformers/tests/test_modeling_albert.py
deleted file mode 100644
index 05d7aaefb5014a16b5908ddff8e1194011f49d46..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_albert.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        AlbertConfig,
-        AlbertModel,
-        AlbertForMaskedLM,
-        AlbertForSequenceClassification,
-        AlbertForQuestionAnswering,
-    )
-    from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
-
-    class AlbertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            embedding_size=16,
-            hidden_size=36,
-            num_hidden_layers=6,
-            num_hidden_groups=6,
-            num_attention_heads=6,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.embedding_size = embedding_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-            self.num_hidden_groups = num_hidden_groups
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = AlbertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-                num_hidden_groups=self.num_hidden_groups,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_albert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_albert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = AlbertForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = AlbertModelTest.AlbertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_albert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_auto.py b/server/transformers/tests/test_modeling_auto.py
deleted file mode 100644
index b39c9de5228df75c9ee1eca08592fe917a380500..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_auto.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_torch, slow
-
-
-if is_torch_available():
-    from transformers import (
-        AutoConfig,
-        BertConfig,
-        AutoModel,
-        BertModel,
-        AutoModelForPreTraining,
-        BertForPreTraining,
-        AutoModelWithLMHead,
-        BertForMaskedLM,
-        RobertaForMaskedLM,
-        AutoModelForSequenceClassification,
-        BertForSequenceClassification,
-        AutoModelForQuestionAnswering,
-        BertForQuestionAnswering,
-    )
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    from transformers.modeling_auto import (
-        MODEL_MAPPING,
-        MODEL_FOR_PRETRAINING_MAPPING,
-        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-        MODEL_WITH_LM_HEAD_MAPPING,
-    )
-
-
-@require_torch
-class AutoModelTest(unittest.TestCase):
-    @slow
-    def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = AutoModel.from_pretrained(model_name)
-            model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, BertModel)
-            for value in loading_info.values():
-                self.assertEqual(len(value), 0)
-
-    @slow
-    def test_model_for_pretraining_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = AutoModelForPreTraining.from_pretrained(model_name)
-            model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, BertForPreTraining)
-            for value in loading_info.values():
-                self.assertEqual(len(value), 0)
-
-    @slow
-    def test_lmhead_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = AutoModelWithLMHead.from_pretrained(model_name)
-            model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, BertForMaskedLM)
-
-    @slow
-    def test_sequence_classification_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = AutoModelForSequenceClassification.from_pretrained(model_name)
-            model, loading_info = AutoModelForSequenceClassification.from_pretrained(
-                model_name, output_loading_info=True
-            )
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, BertForSequenceClassification)
-
-    # @slow
-    def test_question_answering_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, BertForQuestionAnswering)
-
-    def test_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
-        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(model, BertForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14830)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
-
-    def test_from_identifier_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
-        model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
-        self.assertIsInstance(model, RobertaForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14830)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
-
-    def test_parents_and_children_in_mappings(self):
-        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
-        # by the parents and will return the wrong configuration type when using auto models
-
-        mappings = (
-            MODEL_MAPPING,
-            MODEL_FOR_PRETRAINING_MAPPING,
-            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
-            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
-            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-            MODEL_WITH_LM_HEAD_MAPPING,
-        )
-
-        for mapping in mappings:
-            mapping = tuple(mapping.items())
-            for index, (child_config, child_model) in enumerate(mapping[1:]):
-                for parent_config, parent_model in mapping[: index + 1]:
-                    with self.subTest(
-                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
-                    ):
-                        self.assertFalse(issubclass(child_config, parent_config))
-                        self.assertFalse(issubclass(child_model, parent_model))
diff --git a/server/transformers/tests/test_modeling_bert.py b/server/transformers/tests/test_modeling_bert.py
deleted file mode 100644
index 946246ea2e32c0d4ad35d417df35682b522a5e74..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_bert.py
+++ /dev/null
@@ -1,477 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        BertConfig,
-        BertModel,
-        BertForMaskedLM,
-        BertForNextSentencePrediction,
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BertForTokenClassification,
-        BertForMultipleChoice,
-    )
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class BertModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            BertModel,
-            BertForMaskedLM,
-            BertForNextSentencePrediction,
-            BertForPreTraining,
-            BertForQuestionAnswering,
-            BertForSequenceClassification,
-            BertForTokenClassification,
-        )
-        if is_torch_available()
-        else ()
-    )
-
-    class BertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = BertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                is_decoder=False,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def prepare_config_and_inputs_for_decoder(self):
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = self.prepare_config_and_inputs()
-
-            config.is_decoder = True
-            encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-            encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            return (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-                encoder_hidden_states,
-                encoder_attention_mask,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_bert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = BertModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_bert_model_as_decoder(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ):
-            model = BertModel(config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-            )
-            sequence_output, pooled_output = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                encoder_hidden_states=encoder_hidden_states,
-            )
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_bert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = BertForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_bert_model_for_masked_lm_as_decoder(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        ):
-            model = BertForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                masked_lm_labels=token_labels,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-            )
-            loss, prediction_scores = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                masked_lm_labels=token_labels,
-                encoder_hidden_states=encoder_hidden_states,
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_next_sequence_prediction(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = BertForNextSentencePrediction(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, seq_relationship_score = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                next_sentence_label=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "seq_relationship_score": seq_relationship_score,
-            }
-            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_pretraining(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = BertForPreTraining(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores, seq_relationship_score = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                masked_lm_labels=token_labels,
-                next_sentence_label=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-                "seq_relationship_score": seq_relationship_score,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = BertForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = BertForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = BertForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_bert_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_choices = self.num_choices
-            model = BertForMultipleChoice(config=config)
-            model.to(torch_device)
-            model.eval()
-            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss, logits = model(
-                multiple_choice_inputs_ids,
-                attention_mask=multiple_choice_input_mask,
-                token_type_ids=multiple_choice_token_type_ids,
-                labels=choice_labels,
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = BertModelTest.BertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_bert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_model(*config_and_inputs)
-
-    def test_bert_model_as_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
-
-    def test_for_masked_lm_decoder(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
-        self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_common.py b/server/transformers/tests/test_modeling_common.py
deleted file mode 100644
index a5d69fbd6c196096b55b28afdbeb6a4404c02a97..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_common.py
+++ /dev/null
@@ -1,659 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import copy
-import logging
-import os.path
-import random
-import tempfile
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-    import numpy as np
-
-    from transformers import (
-        AdaptiveEmbedding,
-        PretrainedConfig,
-        PreTrainedModel,
-        BertModel,
-        BertConfig,
-        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-def _config_zero_init(config):
-    configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key or "initializer_factor" in key:
-            setattr(configs_no_init, key, 0.0)
-    return configs_no_init
-
-
-@require_torch
-class ModelTesterMixin:
-
-    model_tester = None
-    all_model_classes = ()
-    test_torchscript = True
-    test_pruning = True
-    test_resize_embeddings = True
-    test_head_masking = True
-    is_encoder_decoder = False
-
-    def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            out_2 = outputs[0].numpy()
-            out_2[np.isnan(out_2)] = 0
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(tmpdirname)
-                model.to(torch_device)
-                with torch.no_grad():
-                    after_outputs = model(**inputs_dict)
-
-                # Make sure we don't have nans
-                out_1 = after_outputs[0].cpu().numpy()
-                out_1[np.isnan(out_1)] = 0
-                max_diff = np.amax(np.abs(out_1 - out_2))
-                self.assertLessEqual(max_diff, 1e-5)
-
-    def test_initialization(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        configs_no_init = _config_zero_init(config)
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            for name, param in model.named_parameters():
-                if param.requires_grad:
-                    self.assertIn(
-                        param.data.mean().item(),
-                        [0.0, 1.0],
-                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
-                    )
-
-    def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                first = model(**inputs_dict)[0]
-                second = model(**inputs_dict)[0]
-            out_1 = first.cpu().numpy()
-            out_2 = second.cpu().numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        decoder_seq_length = (
-            self.model_tester.decoder_seq_length
-            if hasattr(self.model_tester, "decoder_seq_length")
-            else self.model_tester.seq_length
-        )
-        encoder_seq_length = (
-            self.model_tester.encoder_seq_length
-            if hasattr(self.model_tester, "encoder_seq_length")
-            else self.model_tester.seq_length
-        )
-        decoder_key_length = (
-            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
-        )
-        encoder_key_length = (
-            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
-        )
-
-        for model_class in self.all_model_classes:
-            config.output_attentions = True
-            config.output_hidden_states = False
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, False)
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            if self.is_encoder_decoder:
-                self.assertEqual(out_len % 2, 0)
-                decoder_attentions = outputs[(out_len // 2) - 1]
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
-
-            # Check attention is always last and order is fine
-            config.output_attentions = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-
-            self_attentions = outputs[-1]
-            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(self_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-
-    def test_torchscript(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        self._create_and_check_torchscript(config, inputs_dict)
-
-    def test_torchscript_output_attentions(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        config.output_attentions = True
-        self._create_and_check_torchscript(config, inputs_dict)
-
-    def test_torchscript_output_hidden_state(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        config.output_hidden_states = True
-        self._create_and_check_torchscript(config, inputs_dict)
-
-    def _create_and_check_torchscript(self, config, inputs_dict):
-        if not self.test_torchscript:
-            return
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.torchscript = True
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = inputs_dict["input_ids"]  # Let's keep only input_ids
-
-            try:
-                traced_gpt2 = torch.jit.trace(model, inputs)
-            except RuntimeError:
-                self.fail("Couldn't trace module.")
-
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
-
-                try:
-                    torch.jit.save(traced_gpt2, pt_file_name)
-                except Exception:
-                    self.fail("Couldn't save module.")
-
-                try:
-                    loaded_model = torch.jit.load(pt_file_name)
-                except Exception:
-                    self.fail("Couldn't load module.")
-
-            model.to(torch_device)
-            model.eval()
-
-            loaded_model.to(torch_device)
-            loaded_model.eval()
-
-            model_state_dict = model.state_dict()
-            loaded_model_state_dict = loaded_model.state_dict()
-
-            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
-
-            models_equal = True
-            for layer_name, p1 in model_state_dict.items():
-                p2 = loaded_model_state_dict[layer_name]
-                if p1.data.ne(p2.data).sum() > 0:
-                    models_equal = False
-
-            self.assertTrue(models_equal)
-
-    def test_headmasking(self):
-        if not self.test_head_masking:
-            return
-
-        global_rng.seed(42)
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        global_rng.seed()
-
-        config.output_attentions = True
-        config.output_hidden_states = True
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-
-            # Prepare head_mask
-            # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-            head_mask = torch.ones(
-                self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
-            )
-            head_mask[0, 0] = 0
-            head_mask[-1, :-1] = 0
-            head_mask.requires_grad_(requires_grad=True)
-            inputs = inputs_dict.copy()
-            inputs["head_mask"] = head_mask
-
-            outputs = model(**inputs)
-
-            # Test that we can get a gradient back for importance score computation
-            output = sum(t.sum() for t in outputs[0])
-            output = output.sum()
-            output.backward()
-            multihead_outputs = head_mask.grad
-
-            attentions = outputs[-1]
-
-            # Remove Nan
-            for t in attentions:
-                self.assertLess(
-                    torch.sum(torch.isnan(t)), t.numel() / 4
-                )  # Check we don't have more than 25% nans (arbitrary)
-            attentions = [
-                t.masked_fill(torch.isnan(t), 0.0) for t in attentions
-            ]  # remove them (the test is less complete)
-
-            self.assertIsNotNone(multihead_outputs)
-            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
-            self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-            self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-            self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
-
-    def test_head_pruning(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            config.output_attentions = True
-            config.output_hidden_states = False
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-            model.prune_heads(heads_to_prune)
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_save_load_from_pretrained(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            config.output_attentions = True
-            config.output_hidden_states = False
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-            model.prune_heads(heads_to_prune)
-
-            with tempfile.TemporaryDirectory() as temp_dir_name:
-                model.save_pretrained(temp_dir_name)
-                model = model_class.from_pretrained(temp_dir_name)
-                model.to(torch_device)
-
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_save_load_from_config_init(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            config.output_attentions = True
-            config.output_hidden_states = False
-
-            heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
-            config.pruned_heads = heads_to_prune
-
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
-
-    def test_head_pruning_integration(self):
-        if not self.test_pruning:
-            return
-
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            if "head_mask" in inputs_dict:
-                del inputs_dict["head_mask"]
-
-            config.output_attentions = True
-            config.output_hidden_states = False
-
-            heads_to_prune = {0: [0], 1: [1, 2]}
-            config.pruned_heads = heads_to_prune
-
-            model = model_class(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            with tempfile.TemporaryDirectory() as temp_dir_name:
-                model.save_pretrained(temp_dir_name)
-                model = model_class.from_pretrained(temp_dir_name)
-                model.to(torch_device)
-
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            heads_to_prune = {0: [0], 2: [1, 2]}
-            model.prune_heads(heads_to_prune)
-
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            attentions = outputs[-1]
-
-            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
-            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
-            self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-
-            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            config.output_hidden_states = True
-            config.output_attentions = False
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-            with torch.no_grad():
-                outputs = model(**inputs_dict)
-            hidden_states = outputs[-1]
-            self.assertEqual(model.config.output_attentions, False)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]),
-                [
-                    self.model_tester.encoder_seq_length
-                    if hasattr(self.model_tester, "encoder_seq_length")
-                    else self.model_tester.seq_length,
-                    self.model_tester.hidden_size,
-                ],
-            )
-
-    def test_resize_tokens_embeddings(self):
-        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.test_resize_embeddings:
-            return
-
-        for model_class in self.all_model_classes:
-            config = copy.deepcopy(original_config)
-            model = model_class(config)
-
-            model_vocab_size = config.vocab_size
-            # Retrieve the embeddings and clone theme
-            model_embed = model.resize_token_embeddings(model_vocab_size)
-            cloned_embeddings = model_embed.weight.clone()
-
-            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            model(**inputs_dict)
-
-            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
-
-            # Check that the model can still do a forward pass successfully (every parameter should be resized)
-            # Input ids should be clamped to the maximum size of the vocabulary
-            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
-            model(**inputs_dict)
-
-            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-            models_equal = True
-            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    models_equal = False
-
-            self.assertTrue(models_equal)
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
-            x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
-
-    def test_tie_model_weights(self):
-        if not self.test_torchscript:
-            return
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        def check_same_values(layer_1, layer_2):
-            equal = True
-            for p1, p2 in zip(layer_1.weight, layer_2.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    equal = False
-            return equal
-
-        for model_class in self.all_model_classes:
-            config.torchscript = True
-            model_not_tied = model_class(config)
-            if model_not_tied.get_output_embeddings() is None:
-                continue
-
-            params_not_tied = list(model_not_tied.parameters())
-
-            config_tied = copy.deepcopy(config)
-            config_tied.torchscript = False
-            model_tied = model_class(config_tied)
-            params_tied = list(model_tied.parameters())
-
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertGreater(len(params_not_tied), len(params_tied))
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # # Check that after modification, they remain the same.
-            # embeddings.weight.data.div_(2)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # # Check that after modification, they remain the same.
-            # decoding.weight.data.div_(4)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            # self.assertTrue(check_same_values(embeddings, decoding))
-
-            # Check that after resize they remain tied.
-            model_tied.resize_token_embeddings(config.vocab_size + 10)
-            params_tied_2 = list(model_tied.parameters())
-            self.assertGreater(len(params_not_tied), len(params_tied))
-            self.assertEqual(len(params_tied_2), len(params_tied))
-
-            # decoding.weight.data.mul_(20)
-            # # Check that the embedding layer and decoding layer are the same in size and in value
-            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
-            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
-
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.is_encoder_decoder:
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
-        else:
-            encoder_input_ids = inputs_dict["encoder_input_ids"]
-            decoder_input_ids = inputs_dict["decoder_input_ids"]
-            del inputs_dict["encoder_input_ids"]
-            del inputs_dict["decoder_input_ids"]
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            wte = model.get_input_embeddings()
-            if not self.is_encoder_decoder:
-                inputs_dict["inputs_embeds"] = wte(input_ids)
-            else:
-                inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
-                inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
-
-            with torch.no_grad():
-                model(**inputs_dict)
-
-
-global_rng = random.Random()
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = global_rng
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
-
-    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
-
-
-def floats_tensor(shape, scale=1.0, rng=None, name=None):
-    """Creates a random float32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = global_rng
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.random() * scale)
-
-    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
-
-
-@require_torch
-class ModelUtilsTest(unittest.TestCase):
-    @slow
-    def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-            for value in loading_info.values():
-                self.assertEqual(len(value), 0)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
diff --git a/server/transformers/tests/test_modeling_ctrl.py b/server/transformers/tests/test_modeling_ctrl.py
deleted file mode 100644
index 3d1a1cb2dc728952f1ef36f667b59ccf7af1a48b..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_ctrl.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and HuggingFace Inc. team.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel
-
-
-@require_torch
-class CTRLModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
-    test_pruning = False
-    test_torchscript = False
-    test_resize_embeddings = False
-    test_head_masking = False
-
-    class CTRLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = CTRLConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = CTRLModel(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-            model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, presents = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "presents": presents,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertEqual(len(result["presents"]), config.n_layer)
-
-        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = CTRLLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
-
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = CTRLModelTest.CTRLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_ctrl_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)
-
-    def test_ctrl_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = CTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_distilbert.py b/server/transformers/tests/test_modeling_distilbert.py
deleted file mode 100644
index 96f487916660c5aacbce6eb82f1f8f1a0a8b9be3..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_distilbert.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import require_torch, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        DistilBertConfig,
-        DistilBertModel,
-        DistilBertForMaskedLM,
-        DistilBertForTokenClassification,
-        DistilBertForQuestionAnswering,
-        DistilBertForSequenceClassification,
-    )
-
-
-@require_torch
-class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
-        if is_torch_available()
-        else None
-    )
-    test_pruning = True
-    test_torchscript = True
-    test_resize_embeddings = True
-    test_head_masking = True
-
-    class DistilBertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=False,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = DistilBertConfig(
-                vocab_size=self.vocab_size,
-                dim=self.hidden_size,
-                n_layers=self.num_hidden_layers,
-                n_heads=self.num_attention_heads,
-                hidden_dim=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                dropout=self.hidden_dropout_prob,
-                attention_dropout=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_distilbert_model(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = DistilBertModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            (sequence_output,) = model(input_ids, input_mask)
-            (sequence_output,) = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_distilbert_for_masked_lm(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = DistilBertForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_distilbert_for_question_answering(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = DistilBertForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_distilbert_for_sequence_classification(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = DistilBertForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_distilbert_for_token_classification(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = DistilBertForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_distilbert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
-
-    # @slow
-    # def test_model_from_pretrained(self):
-    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-    #         self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_encoder_decoder.py b/server/transformers/tests/test_modeling_encoder_decoder.py
deleted file mode 100644
index ac01e7b5615f5bcc5d827e0f5bf6aa9d3337a73b..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_encoder_decoder.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Hugging Face Inc. Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow
-
-
-if is_torch_available():
-    from transformers import BertModel, BertForMaskedLM, Model2Model
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class EncoderDecoderModelTest(unittest.TestCase):
-    @slow
-    def test_model2model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = Model2Model.from_pretrained(model_name)
-            self.assertIsInstance(model.encoder, BertModel)
-            self.assertIsInstance(model.decoder, BertForMaskedLM)
-            self.assertEqual(model.decoder.config.is_decoder, True)
-            self.assertEqual(model.encoder.config.is_decoder, False)
-
-    def test_model2model_from_pretrained_not_bert(self):
-        logging.basicConfig(level=logging.INFO)
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("roberta")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("distilbert")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("does-not-exist")
diff --git a/server/transformers/tests/test_modeling_gpt2.py b/server/transformers/tests/test_modeling_gpt2.py
deleted file mode 100644
index 3976c7d452a9d96281bdbd6da55c8eab824c99da..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_gpt2.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        GPT2Config,
-        GPT2Model,
-        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-        GPT2LMHeadModel,
-        GPT2DoubleHeadsModel,
-    )
-
-
-@require_torch
-class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
-
-    class GPT2ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = GPT2Config(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = GPT2Model(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-            model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, presents = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "presents": presents,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertEqual(len(result["presents"]), config.n_layer)
-
-        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = GPT2LMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_double_lm_head_model(
-            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-        ):
-            model = GPT2DoubleHeadsModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-
-            inputs = {
-                "input_ids": multiple_choice_inputs_ids,
-                "mc_token_ids": mc_token_ids,
-                "attention_mask": multiple_choice_input_mask,
-                "token_type_ids": multiple_choice_token_type_ids,
-                "lm_labels": multiple_choice_inputs_ids,
-            }
-
-            loss, lm_logits, mc_logits, _ = model(**inputs)
-
-            result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
-
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_gpt2_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
-
-    def test_gpt2_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
-
-    def test_gpt2_double_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = GPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_openai.py b/server/transformers/tests/test_modeling_openai.py
deleted file mode 100644
index a2aaabb645db62fe7544fec7a1e9bb0aa608f1ef..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_openai.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        OpenAIGPTConfig,
-        OpenAIGPTModel,
-        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        OpenAIGPTLMHeadModel,
-        OpenAIGPTDoubleHeadsModel,
-    )
-
-
-@require_torch
-class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
-    )
-
-    class OpenAIGPTModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = OpenAIGPTConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTModel(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-            model(input_ids, token_type_ids=token_type_ids)
-            (sequence_output,) = model(input_ids)
-
-            result = {"sequence_output": sequence_output}
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTDoubleHeadsModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                head_mask,
-                token_type_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
-
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_openai_gpt_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
-
-    def test_openai_gpt_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
-
-    def test_openai_gpt_double_lm_head_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_roberta.py b/server/transformers/tests/test_modeling_roberta.py
deleted file mode 100644
index 2a63ac232a70943937de03f2daa90667e5da6a28..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_roberta.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-    from transformers import (
-        RobertaConfig,
-        RobertaModel,
-        RobertaForMaskedLM,
-        RobertaForSequenceClassification,
-        RobertaForTokenClassification,
-    )
-    from transformers.modeling_roberta import RobertaEmbeddings
-    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
-
-    class RobertaModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = RobertaConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_roberta_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = RobertaModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_roberta_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = RobertaForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_roberta_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = RobertaForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = RobertaModelTest.RobertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_roberta_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_roberta_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
-
-    def test_create_position_ids_respects_padding_index(self):
-        """ Ensure that the default position ids only assign a sequential . This is a regression
-        test for https://github.com/huggingface/transformers/issues/1761
-
-        The position ids should be masked with the embedding object's padding index. Therefore, the
-        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
-        """
-        config = self.model_tester.prepare_config_and_inputs()[0]
-        model = RobertaEmbeddings(config=config)
-
-        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
-        expected_positions = torch.as_tensor(
-            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
-        )
-
-        position_ids = model.create_position_ids_from_input_ids(input_ids)
-        self.assertEqual(position_ids.shape, expected_positions.shape)
-        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
-
-    def test_create_position_ids_from_inputs_embeds(self):
-        """ Ensure that the default position ids only assign a sequential . This is a regression
-        test for https://github.com/huggingface/transformers/issues/1761
-
-        The position ids should be masked with the embedding object's padding index. Therefore, the
-        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
-        """
-        config = self.model_tester.prepare_config_and_inputs()[0]
-        embeddings = RobertaEmbeddings(config=config)
-
-        inputs_embeds = torch.Tensor(2, 4, 30)
-        expected_single_positions = [
-            0 + embeddings.padding_idx + 1,
-            1 + embeddings.padding_idx + 1,
-            2 + embeddings.padding_idx + 1,
-            3 + embeddings.padding_idx + 1,
-        ]
-        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
-        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
-        self.assertEqual(position_ids.shape, expected_positions.shape)
-        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
-
-
-class RobertaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = RobertaForMaskedLM.from_pretrained("roberta-base")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(output.shape, expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
-        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
-
-    @slow
-    def test_inference_no_head(self):
-        model = RobertaModel.from_pretrained("roberta-base")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
-        )
-        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
-
-    @slow
-    def test_inference_classification_head(self):
-        model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 3))
-        self.assertEqual(output.shape, expected_shape)
-        expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3))
diff --git a/server/transformers/tests/test_modeling_t5.py b/server/transformers/tests/test_modeling_t5.py
deleted file mode 100644
index 964d5d4afee1f524c4f820710aa7d22b772fd9c1..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_t5.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow
-
-
-if is_torch_available():
-    from transformers import T5Config, T5Model, T5WithLMHeadModel
-    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class T5ModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
-    test_pruning = False
-    test_torchscript = False
-    test_resize_embeddings = False
-    is_encoder_decoder = True
-
-    class T5ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            encoder_seq_length=7,
-            decoder_seq_length=9,
-            is_training=True,
-            use_attention_mask=True,
-            use_labels=True,
-            vocab_size=99,
-            n_positions=14,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            d_ff=37,
-            relative_attention_num_buckets=8,
-            dropout_rate=0.1,
-            initializer_factor=0.002,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.encoder_seq_length = encoder_seq_length
-            self.decoder_seq_length = decoder_seq_length
-            self.is_training = is_training
-            self.use_attention_mask = use_attention_mask
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_positions = n_positions
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.d_ff = d_ff
-            self.relative_attention_num_buckets = relative_attention_num_buckets
-            self.dropout_rate = dropout_rate
-            self.initializer_factor = initializer_factor
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            encoder_attention_mask = None
-            decoder_attention_mask = None
-            if self.use_attention_mask:
-                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-            decoder_lm_labels = None
-            if self.use_labels:
-                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            config = T5Config(
-                vocab_size=self.vocab_size,
-                n_positions=self.n_positions,
-                d_model=self.hidden_size,
-                d_ff=self.d_ff,
-                d_kv=self.hidden_size // self.num_attention_heads,
-                num_layers=self.num_hidden_layers,
-                num_heads=self.num_attention_heads,
-                relative_attention_num_buckets=self.relative_attention_num_buckets,
-                dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor,
-            )
-
-            return (
-                config,
-                encoder_input_ids,
-                decoder_input_ids,
-                encoder_attention_mask,
-                decoder_attention_mask,
-                decoder_lm_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_t5_model(
-            self,
-            config,
-            encoder_input_ids,
-            decoder_input_ids,
-            encoder_attention_mask,
-            decoder_attention_mask,
-            decoder_lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.eval()
-            decoder_output, encoder_output = model(
-                encoder_input_ids=encoder_input_ids,
-                decoder_input_ids=decoder_input_ids,
-                encoder_attention_mask=encoder_attention_mask,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            decoder_output, encoder_output = model(
-                encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids
-            )
-
-            result = {
-                "encoder_output": encoder_output,
-                "decoder_output": decoder_output,
-            }
-            self.parent.assertListEqual(
-                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
-            )
-
-        def create_and_check_t5_with_lm_head(
-            self,
-            config,
-            encoder_input_ids,
-            decoder_input_ids,
-            encoder_attention_mask,
-            decoder_attention_mask,
-            decoder_lm_labels,
-        ):
-            model = T5WithLMHeadModel(config=config)
-            model.eval()
-            outputs = model(
-                encoder_input_ids=encoder_input_ids,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                decoder_lm_labels=decoder_lm_labels,
-            )
-            loss, prediction_scores = outputs[0], outputs[1]
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                encoder_input_ids,
-                decoder_input_ids,
-                encoder_attention_mask,
-                decoder_attention_mask,
-                decoder_lm_labels,
-            ) = config_and_inputs
-            inputs_dict = {
-                "encoder_input_ids": encoder_input_ids,
-                "decoder_input_ids": decoder_input_ids,
-                "decoder_attention_mask": decoder_attention_mask,
-                "encoder_attention_mask": encoder_attention_mask,
-            }
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = T5ModelTest.T5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_t5_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_model(*config_and_inputs)
-
-    def test_with_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_albert.py b/server/transformers/tests/test_modeling_tf_albert.py
deleted file mode 100644
index fb7b269cdcabde5993e916c7ac8818e95b2932ff..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_albert.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import AlbertConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    from transformers.modeling_tf_albert import (
-        TFAlbertModel,
-        TFAlbertForMaskedLM,
-        TFAlbertForSequenceClassification,
-        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else ()
-    )
-
-    class TFAlbertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            embedding_size=16,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.embedding_size = embedding_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = AlbertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_albert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFAlbertModel(config=config)
-            # inputs = {'input_ids': input_ids,
-            #           'attention_mask': input_mask,
-            #           'token_type_ids': token_type_ids}
-            # sequence_output, pooled_output = model(**inputs)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output, pooled_output = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            sequence_output, pooled_output = model(inputs)
-
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-                "pooled_output": pooled_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
-
-        def create_and_check_albert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFAlbertForMaskedLM(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (prediction_scores,) = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_albert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFAlbertForSequenceClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_albert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_auto.py b/server/transformers/tests/test_modeling_tf_auto.py
deleted file mode 100644
index 6994f6eaa949c43be73219df544867d3d57a5bfd..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_auto.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-import unittest
-
-from transformers import is_tf_available
-
-from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_tf, slow
-
-
-if is_tf_available():
-    from transformers import (
-        AutoConfig,
-        BertConfig,
-        TFAutoModel,
-        TFBertModel,
-        TFAutoModelForPreTraining,
-        TFBertForPreTraining,
-        TFAutoModelWithLMHead,
-        TFBertForMaskedLM,
-        TFRobertaForMaskedLM,
-        TFAutoModelForSequenceClassification,
-        TFBertForSequenceClassification,
-        TFAutoModelForQuestionAnswering,
-        TFBertForQuestionAnswering,
-    )
-
-
-@require_tf
-class TFAutoModelTest(unittest.TestCase):
-    @slow
-    def test_model_from_pretrained(self):
-        import h5py
-
-        self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
-
-        logging.basicConfig(level=logging.INFO)
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertModel)
-
-    @slow
-    def test_model_for_pretraining_from_pretrained(self):
-        import h5py
-
-        self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
-
-        logging.basicConfig(level=logging.INFO)
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelForPreTraining.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForPreTraining)
-
-    @slow
-    def test_lmhead_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelWithLMHead.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForMaskedLM)
-
-    @slow
-    def test_sequence_classification_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForSequenceClassification)
-
-    @slow
-    def test_question_answering_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            config = AutoConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, BertConfig)
-
-            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, TFBertForQuestionAnswering)
-
-    def test_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
-        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(model, TFBertForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14830)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
-
-    def test_from_identifier_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
-        model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
-        self.assertIsInstance(model, TFRobertaForMaskedLM)
-        self.assertEqual(model.num_parameters(), 14830)
-        self.assertEqual(model.num_parameters(only_trainable=True), 14830)
diff --git a/server/transformers/tests/test_modeling_tf_bert.py b/server/transformers/tests/test_modeling_tf_bert.py
deleted file mode 100644
index d91d4863afe42543d071eecc09864f1a0913ec80..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_bert.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import BertConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers.modeling_tf_bert import (
-        TFBertModel,
-        TFBertForMaskedLM,
-        TFBertForNextSentencePrediction,
-        TFBertForPreTraining,
-        TFBertForSequenceClassification,
-        TFBertForMultipleChoice,
-        TFBertForTokenClassification,
-        TFBertForQuestionAnswering,
-    )
-
-
-@require_tf
-class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            TFBertModel,
-            TFBertForMaskedLM,
-            TFBertForNextSentencePrediction,
-            TFBertForPreTraining,
-            TFBertForQuestionAnswering,
-            TFBertForSequenceClassification,
-            TFBertForTokenClassification,
-        )
-        if is_tf_available()
-        else ()
-    )
-
-    class TFBertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = BertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_bert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFBertModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output, pooled_output = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            sequence_output, pooled_output = model(inputs)
-
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-                "pooled_output": pooled_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
-
-        def create_and_check_bert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFBertForMaskedLM(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (prediction_scores,) = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_bert_for_next_sequence_prediction(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFBertForNextSentencePrediction(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (seq_relationship_score,) = model(inputs)
-            result = {
-                "seq_relationship_score": seq_relationship_score.numpy(),
-            }
-            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
-
-        def create_and_check_bert_for_pretraining(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFBertForPreTraining(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            prediction_scores, seq_relationship_score = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-                "seq_relationship_score": seq_relationship_score.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
-
-        def create_and_check_bert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFBertForSequenceClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
-
-        def create_and_check_bert_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_choices = self.num_choices
-            model = TFBertForMultipleChoice(config=config)
-            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-            inputs = {
-                "input_ids": multiple_choice_inputs_ids,
-                "attention_mask": multiple_choice_input_mask,
-                "token_type_ids": multiple_choice_token_type_ids,
-            }
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
-
-        def create_and_check_bert_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFBertForTokenClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
-            )
-
-        def create_and_check_bert_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFBertForQuestionAnswering(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            start_logits, end_logits = model(inputs)
-            result = {
-                "start_logits": start_logits.numpy(),
-                "end_logits": end_logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFBertModelTest.TFBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_bert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
-
-    def test_for_multiple_choice(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
-
-    def test_for_next_sequence_prediction(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
-
-    def test_for_pretraining(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
-
-    def test_for_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ["bert-base-uncased"]:
-            model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_common.py b/server/transformers/tests/test_modeling_tf_common.py
deleted file mode 100644
index bcfb6bfe5d457e7dece90b8a6aada8a670bd9b58..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_common.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import copy
-import os
-import random
-import tempfile
-
-from transformers import is_tf_available, is_torch_available
-
-from .utils import require_tf
-
-
-if is_tf_available():
-    import tensorflow as tf
-    import numpy as np
-
-    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-def _config_zero_init(config):
-    configs_no_init = copy.deepcopy(config)
-    for key in configs_no_init.__dict__.keys():
-        if "_range" in key or "_std" in key:
-            setattr(configs_no_init, key, 0.0)
-    return configs_no_init
-
-
-@require_tf
-class TFModelTesterMixin:
-
-    model_tester = None
-    all_model_classes = ()
-    test_torchscript = True
-    test_pruning = True
-    test_resize_embeddings = True
-    is_encoder_decoder = False
-
-    def test_initialization(self):
-        pass
-        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # configs_no_init = _config_zero_init(config)
-        # for model_class in self.all_model_classes:
-        #     model = model_class(config=configs_no_init)
-        #     for name, param in model.named_parameters():
-        #         if param.requires_grad:
-        #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
-        #             msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
-
-    def test_save_load(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs = model(inputs_dict)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(tmpdirname)
-                after_outputs = model(inputs_dict)
-
-                # Make sure we don't have nans
-                out_1 = after_outputs[0].numpy()
-                out_2 = outputs[0].numpy()
-                out_1 = out_1[~np.isnan(out_1)]
-                out_2 = out_2[~np.isnan(out_2)]
-                max_diff = np.amax(np.abs(out_1 - out_2))
-                self.assertLessEqual(max_diff, 1e-5)
-
-    def test_pt_tf_model_equivalence(self):
-        if not is_torch_available():
-            return
-
-        import torch
-        import transformers
-
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
-            pt_model_class = getattr(transformers, pt_model_class_name)
-
-            config.output_hidden_states = True
-            tf_model = model_class(config)
-            pt_model = pt_model_class(config)
-
-            # Check we can load pt model in tf and vice-versa with model => model functions
-            tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
-            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = dict(
-                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
-            )
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(inputs_dict, training=False)
-            tf_hidden_states = tfo[0].numpy()
-            pt_hidden_states = pto[0].numpy()
-
-            tf_nans = np.copy(np.isnan(tf_hidden_states))
-            pt_nans = np.copy(np.isnan(pt_hidden_states))
-
-            pt_hidden_states[tf_nans] = 0
-            tf_hidden_states[tf_nans] = 0
-            pt_hidden_states[pt_nans] = 0
-            tf_hidden_states[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-            # Debug info (remove when fixed)
-            if max_diff >= 2e-2:
-                print("===")
-                print(model_class)
-                print(config)
-                print(inputs_dict)
-                print(pt_inputs_dict)
-            self.assertLessEqual(max_diff, 2e-2)
-
-            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
-                torch.save(pt_model.state_dict(), pt_checkpoint_path)
-                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
-
-                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
-                tf_model.save_weights(tf_checkpoint_path)
-                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
-
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            pt_inputs_dict = dict(
-                (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
-            )
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(inputs_dict)
-            tfo = tfo[0].numpy()
-            pto = pto[0].numpy()
-            tf_nans = np.copy(np.isnan(tfo))
-            pt_nans = np.copy(np.isnan(pto))
-
-            pto[tf_nans] = 0
-            tfo[tf_nans] = 0
-            pto[pt_nans] = 0
-            tfo[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tfo - pto))
-            self.assertLessEqual(max_diff, 2e-2)
-
-    def test_compile_tf_model(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        if self.is_encoder_decoder:
-            input_ids = {
-                "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"),
-                "encoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"),
-            }
-        else:
-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
-        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
-
-        for model_class in self.all_model_classes:
-            # Prepare our model
-            model = model_class(config)
-
-            # Let's load it from the disk to be sure we can use pretrained weights
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                outputs = model(inputs_dict)  # build the model
-                model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(tmpdirname)
-
-            outputs_dict = model(input_ids)
-            hidden_states = outputs_dict[0]
-
-            # Add a dense layer on top to test intetgration with other keras modules
-            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
-
-            # Compile extended model
-            extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
-            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-    def test_keyword_and_dict_args(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            outputs_dict = model(inputs_dict)
-
-            inputs_keywords = copy.deepcopy(inputs_dict)
-            input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None)
-            outputs_keywords = model(input_ids, **inputs_keywords)
-
-            output_dict = outputs_dict[0].numpy()
-            output_keywords = outputs_keywords[0].numpy()
-
-            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        decoder_seq_length = (
-            self.model_tester.decoder_seq_length
-            if hasattr(self.model_tester, "decoder_seq_length")
-            else self.model_tester.seq_length
-        )
-        encoder_seq_length = (
-            self.model_tester.encoder_seq_length
-            if hasattr(self.model_tester, "encoder_seq_length")
-            else self.model_tester.seq_length
-        )
-        decoder_key_length = (
-            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
-        )
-        encoder_key_length = (
-            self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
-        )
-
-        for model_class in self.all_model_classes:
-            config.output_attentions = True
-            config.output_hidden_states = False
-            model = model_class(config)
-            outputs = model(inputs_dict)
-            attentions = [t.numpy() for t in outputs[-1]]
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, False)
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            if self.is_encoder_decoder:
-                self.assertEqual(out_len % 2, 0)
-                decoder_attentions = outputs[(out_len // 2) - 1]
-                self.assertEqual(model.config.output_attentions, True)
-                self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-                self.assertListEqual(
-                    list(decoder_attentions[0].shape[-3:]),
-                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
-                )
-
-            # Check attention is always last and order is fine
-            config.output_attentions = True
-            config.output_hidden_states = True
-            model = model_class(config)
-            outputs = model(inputs_dict)
-            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-
-            attentions = [t.numpy() for t in outputs[-1]]
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-
-    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            config.output_hidden_states = True
-            config.output_attentions = False
-            model = model_class(config)
-            outputs = model(inputs_dict)
-            hidden_states = [t.numpy() for t in outputs[-1]]
-            self.assertEqual(model.config.output_attentions, False)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
-            self.assertListEqual(
-                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
-            )
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
-            x = model.get_output_embeddings()
-            assert x is None or isinstance(x, tf.keras.layers.Layer)
-
-    def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-            out_1 = first.numpy()
-            out_2 = second.numpy()
-            out_1 = out_1[~np.isnan(out_1)]
-            out_2 = out_2[~np.isnan(out_2)]
-            max_diff = np.amax(np.abs(out_1 - out_2))
-            self.assertLessEqual(max_diff, 1e-5)
-
-    def _get_embeds(self, wte, input_ids):
-        # ^^ In our TF models, the input_embeddings can take slightly different forms,
-        # so we try a few of them.
-        # We used to fall back to just synthetically creating a dummy tensor of ones:
-        try:
-            x = wte(input_ids, mode="embedding")
-        except Exception:
-            try:
-                x = wte([input_ids], mode="embedding")
-            except Exception:
-                try:
-                    x = wte([input_ids, None, None, None], mode="embedding")
-                except Exception:
-                    if hasattr(self.model_tester, "embedding_size"):
-                        x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                    else:
-                        x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-        return x
-
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.is_encoder_decoder:
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
-        else:
-            encoder_input_ids = inputs_dict["encoder_input_ids"]
-            decoder_input_ids = inputs_dict["decoder_input_ids"]
-            del inputs_dict["encoder_input_ids"]
-            del inputs_dict["decoder_input_ids"]
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            wte = model.get_input_embeddings()
-            if not self.is_encoder_decoder:
-                inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
-            else:
-                inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
-                inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
-
-            model(inputs_dict)
-
-
-def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = random.Random()
-
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
-
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
-
-    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)
-
-    return output
diff --git a/server/transformers/tests/test_modeling_tf_ctrl.py b/server/transformers/tests/test_modeling_tf_ctrl.py
deleted file mode 100644
index 4997c2a573a12c87071ae4fbdf6aeec1c6ac9646..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_ctrl.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import CTRLConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_tf
-class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
-
-    class TFCTRLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = CTRLConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFCTRLModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output = model(inputs)[0]
-
-            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
-            sequence_output = model(inputs)[0]
-
-            sequence_output = model(input_ids)[0]
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFCTRLLMHeadModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            prediction_scores = model(inputs)[0]
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFCTRLModelTest.TFCTRLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_ctrl_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)
-
-    def test_ctrl_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_distilbert.py b/server/transformers/tests/test_modeling_tf_distilbert.py
deleted file mode 100644
index 5546e7a5b850412a82c14332da37b041b1e3adac..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_distilbert.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import DistilBertConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import require_tf
-
-
-if is_tf_available():
-    from transformers.modeling_tf_distilbert import (
-        TFDistilBertModel,
-        TFDistilBertForMaskedLM,
-        TFDistilBertForQuestionAnswering,
-        TFDistilBertForSequenceClassification,
-    )
-
-
-@require_tf
-class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            TFDistilBertModel,
-            TFDistilBertForMaskedLM,
-            TFDistilBertForQuestionAnswering,
-            TFDistilBertForSequenceClassification,
-        )
-        if is_tf_available()
-        else None
-    )
-    test_pruning = True
-    test_torchscript = True
-    test_resize_embeddings = True
-    test_head_masking = True
-
-    class TFDistilBertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=False,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = DistilBertConfig(
-                vocab_size=self.vocab_size,
-                dim=self.hidden_size,
-                n_layers=self.num_hidden_layers,
-                n_heads=self.num_attention_heads,
-                hidden_dim=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                dropout=self.hidden_dropout_prob,
-                attention_dropout=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_distilbert_model(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFDistilBertModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-
-            outputs = model(inputs)
-            sequence_output = outputs[0]
-
-            inputs = [input_ids, input_mask]
-
-            (sequence_output,) = model(inputs)
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_distilbert_for_masked_lm(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFDistilBertForMaskedLM(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-            (prediction_scores,) = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_distilbert_for_question_answering(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFDistilBertForQuestionAnswering(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-            start_logits, end_logits = model(inputs)
-            result = {
-                "start_logits": start_logits.numpy(),
-                "end_logits": end_logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-
-        def create_and_check_distilbert_for_sequence_classification(
-            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFDistilBertForSequenceClassification(config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFDistilBertModelTest.TFDistilBertModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_distilbert_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
-
-    # @slow
-    # def test_model_from_pretrained(self):
-    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModesss.from_pretrained(model_name, cache_dir=CACHE_DIR)
-    #         self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_gpt2.py b/server/transformers/tests/test_modeling_tf_gpt2.py
deleted file mode 100644
index d7b0809964799f35eebe87144dddf4d7e01b0960..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_gpt2.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import GPT2Config, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers.modeling_tf_gpt2 import (
-        TFGPT2Model,
-        TFGPT2LMHeadModel,
-        TFGPT2DoubleHeadsModel,
-        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
-    # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else ()
-
-    class TFGPT2ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = GPT2Config(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFGPT2Model(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output = model(inputs)[0]
-
-            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
-            sequence_output = model(inputs)[0]
-
-            sequence_output = model(input_ids)[0]
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFGPT2LMHeadModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            prediction_scores = model(inputs)[0]
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_gpt2_double_head(
-            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-        ):
-            model = TFGPT2DoubleHeadsModel(config=config)
-
-            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-
-            inputs = {
-                "input_ids": multiple_choice_inputs_ids,
-                "mc_token_ids": mc_token_ids,
-                "attention_mask": multiple_choice_input_mask,
-                "token_type_ids": multiple_choice_token_type_ids,
-            }
-            lm_logits, mc_logits = model(inputs)[:2]
-            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
-            self.parent.assertListEqual(
-                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_gpt2_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
-
-    def test_gpt2_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
-
-    def test_gpt2_double_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_openai_gpt.py b/server/transformers/tests/test_modeling_tf_openai_gpt.py
deleted file mode 100644
index b825c94fca27aeb4c598ba1ea08bb55bb6cfef96..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_openai_gpt.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import OpenAIGPTConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers.modeling_tf_openai import (
-        TFOpenAIGPTModel,
-        TFOpenAIGPTLMHeadModel,
-        TFOpenAIGPTDoubleHeadsModel,
-        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()
-    )
-
-    class TFOpenAIGPTModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = OpenAIGPTConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFOpenAIGPTModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output = model(inputs)[0]
-
-            inputs = [input_ids, input_mask]
-            sequence_output = model(inputs)[0]
-
-            sequence_output = model(input_ids)[0]
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = TFOpenAIGPTLMHeadModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            prediction_scores = model(inputs)[0]
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_openai_gpt_double_head(
-            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-        ):
-            model = TFOpenAIGPTDoubleHeadsModel(config=config)
-
-            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
-            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
-            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
-
-            inputs = {
-                "input_ids": multiple_choice_inputs_ids,
-                "mc_token_ids": mc_token_ids,
-                "attention_mask": multiple_choice_input_mask,
-                "token_type_ids": multiple_choice_token_type_ids,
-            }
-            lm_logits, mc_logits = model(inputs)[:2]
-            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
-            self.parent.assertListEqual(
-                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_openai_gpt_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
-
-    def test_openai_gpt_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs)
-
-    def test_openai_gpt_double_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_roberta.py b/server/transformers/tests/test_modeling_tf_roberta.py
deleted file mode 100644
index 21b0ffee0e8069b9aa56afad5678d870445aea41..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_roberta.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import RobertaConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    import numpy
-    from transformers.modeling_tf_roberta import (
-        TFRobertaModel,
-        TFRobertaForMaskedLM,
-        TFRobertaForSequenceClassification,
-        TFRobertaForTokenClassification,
-        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else ()
-    )
-
-    class TFRobertaModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = RobertaConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_roberta_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFRobertaModel(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output = model(inputs)[0]
-
-            inputs = [input_ids, input_mask]
-            sequence_output = model(inputs)[0]
-
-            sequence_output = model(input_ids)[0]
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_roberta_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFRobertaForMaskedLM(config=config)
-            prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_roberta_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = TFRobertaForTokenClassification(config=config)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            (logits,) = model(inputs)
-            result = {
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFRobertaModelTest.TFRobertaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_roberta_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_roberta_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFRobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
-
-
-class TFRobertaModelIntegrationTest(unittest.TestCase):
-    @slow
-    def test_inference_masked_lm(self):
-        model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 11, 50265]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
-
-    @slow
-    def test_inference_no_head(self):
-        model = TFRobertaModel.from_pretrained("roberta-base")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = tf.constant(
-            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]]
-        )
-        self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))
-
-    @slow
-    def test_inference_classification_head(self):
-        model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
-
-        input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = [1, 3]
-        self.assertEqual(list(output.numpy().shape), expected_shape)
-        expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]])
-        self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3))
diff --git a/server/transformers/tests/test_modeling_tf_t5.py b/server/transformers/tests/test_modeling_tf_t5.py
deleted file mode 100644
index d5589eaf165cbdcb42c8a60c77eb9be4f9493930..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_t5.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import T5Config, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
-
-
-@require_tf
-class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    is_encoder_decoder = True
-    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
-
-    class TFT5ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_labels=True,
-            vocab_size=99,
-            n_positions=14,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            d_ff=37,
-            relative_attention_num_buckets=8,
-            dropout_rate=0.1,
-            initializer_factor=0.002,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_positions = n_positions
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.d_ff = d_ff
-            self.relative_attention_num_buckets = relative_attention_num_buckets
-            self.dropout_rate = dropout_rate
-            self.initializer_factor = initializer_factor
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_labels = None
-            if self.use_labels:
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            config = T5Config(
-                vocab_size=self.vocab_size,
-                n_positions=self.n_positions,
-                d_model=self.hidden_size,
-                d_ff=self.d_ff,
-                d_kv=self.hidden_size // self.num_attention_heads,
-                num_layers=self.num_hidden_layers,
-                num_heads=self.num_attention_heads,
-                relative_attention_num_buckets=self.relative_attention_num_buckets,
-                dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor,
-            )
-
-            return (config, input_ids, input_mask, token_labels)
-
-        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
-            model = TFT5Model(config=config)
-            inputs = {
-                "encoder_input_ids": input_ids,
-                "decoder_input_ids": input_ids,
-                "decoder_attention_mask": input_mask,
-            }
-            encoder_output, decoder_output = model(inputs)
-
-            encoder_output, decoder_output = model(
-                input_ids, decoder_attention_mask=input_mask, encoder_input_ids=input_ids
-            )
-
-            result = {
-                "encoder_output": encoder_output.numpy(),
-                "decoder_output": decoder_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
-            model = TFT5WithLMHeadModel(config=config)
-            inputs = {
-                "encoder_input_ids": input_ids,
-                "decoder_input_ids": input_ids,
-                "decoder_attention_mask": input_mask,
-            }
-            prediction_scores, decoder_output = model(inputs)
-            result = {
-                "prediction_scores": prediction_scores.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, input_mask, token_labels) = config_and_inputs
-            inputs_dict = {
-                "encoder_input_ids": input_ids,
-                "decoder_input_ids": input_ids,
-                "decoder_attention_mask": input_mask,
-            }
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_t5_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_model(*config_and_inputs)
-
-    def test_with_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in ["t5-small"]:
-            model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_transfo_xl.py b/server/transformers/tests/test_modeling_tf_transfo_xl.py
deleted file mode 100644
index f94f2032a26b753f6003372499e0fccdbab4d864..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_transfo_xl.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import random
-import unittest
-
-from transformers import TransfoXLConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers.modeling_tf_transfo_xl import (
-        TFTransfoXLModel,
-        TFTransfoXLLMHeadModel,
-        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
-    test_pruning = False
-    test_torchscript = False
-    test_resize_embeddings = False
-
-    class TFTransfoXLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            mem_len=30,
-            clamp_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            d_embed=32,
-            num_attention_heads=4,
-            d_head=8,
-            d_inner=128,
-            div_val=2,
-            num_hidden_layers=5,
-            scope=None,
-            seed=1,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            self.key_length = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.d_embed = d_embed
-            self.num_attention_heads = num_attention_heads
-            self.d_head = d_head
-            self.d_inner = d_inner
-            self.div_val = div_val
-            self.num_hidden_layers = num_hidden_layers
-            self.scope = scope
-            self.seed = seed
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            lm_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            config = TransfoXLConfig(
-                vocab_size=self.vocab_size,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                cutoffs=self.cutoffs,
-                d_model=self.hidden_size,
-                d_embed=self.d_embed,
-                n_head=self.num_attention_heads,
-                d_head=self.d_head,
-                d_inner=self.d_inner,
-                div_val=self.div_val,
-                n_layer=self.num_hidden_layers,
-            )
-
-            return (config, input_ids_1, input_ids_2, lm_labels)
-
-        def set_seed(self):
-            random.seed(self.seed)
-            tf.random.set_seed(self.seed)
-
-        def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
-            model = TFTransfoXLModel(config)
-
-            hidden_states_1, mems_1 = model(input_ids_1)
-
-            inputs = {"input_ids": input_ids_2, "mems": mems_1}
-
-            hidden_states_2, mems_2 = model(inputs)
-
-            result = {
-                "hidden_states_1": hidden_states_1.numpy(),
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "hidden_states_2": hidden_states_2.numpy(),
-                "mems_2": [mem.numpy() for mem in mems_2],
-            }
-
-            self.parent.assertListEqual(
-                list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
-            model = TFTransfoXLLMHeadModel(config)
-
-            lm_logits_1, mems_1 = model(input_ids_1)
-
-            inputs = {"input_ids": input_ids_1, "labels": lm_labels}
-            _, mems_1 = model(inputs)
-
-            lm_logits_2, mems_2 = model([input_ids_2, mems_1])
-
-            inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}
-
-            _, mems_2 = model(inputs)
-
-            result = {
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "lm_logits_1": lm_logits_1.numpy(),
-                "mems_2": [mem.numpy() for mem in mems_2],
-                "lm_logits_2": lm_logits_2.numpy(),
-            }
-
-            self.parent.assertListEqual(
-                list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-            self.parent.assertListEqual(
-                list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids_1}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_transfo_xl_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_model(*config_and_inputs)
-
-    def test_transfo_xl_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_xlm.py b/server/transformers/tests/test_modeling_tf_xlm.py
deleted file mode 100644
index 53719f63f4bda65d759df84e14039a329872402e..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_xlm.py
+++ /dev/null
@@ -1,307 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers import (
-        XLMConfig,
-        TFXLMModel,
-        TFXLMWithLMHeadModel,
-        TFXLMForSequenceClassification,
-        TFXLMForQuestionAnsweringSimple,
-        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
-        if is_tf_available()
-        else ()
-    )
-
-    class TFXLMModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_lengths=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            gelu_activation=True,
-            sinusoidal_embeddings=False,
-            causal=False,
-            asm=False,
-            n_langs=2,
-            vocab_size=99,
-            n_special=0,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            summary_type="last",
-            use_proj=True,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_lengths = use_input_lengths
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.asm = asm
-            self.n_langs = n_langs
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.summary_type = summary_type
-            self.causal = causal
-            self.use_proj = use_proj
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.n_langs = n_langs
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.summary_type = summary_type
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
-
-            input_lengths = None
-            if self.use_input_lengths:
-                input_lengths = (
-                    ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
-                )  # small variation of seq_length
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
-
-            sequence_labels = None
-            token_labels = None
-            is_impossible_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-
-            config = XLMConfig(
-                vocab_size=self.vocab_size,
-                n_special=self.n_special,
-                emb_dim=self.hidden_size,
-                n_layers=self.num_hidden_layers,
-                n_heads=self.num_attention_heads,
-                dropout=self.hidden_dropout_prob,
-                attention_dropout=self.attention_probs_dropout_prob,
-                gelu_activation=self.gelu_activation,
-                sinusoidal_embeddings=self.sinusoidal_embeddings,
-                asm=self.asm,
-                causal=self.causal,
-                n_langs=self.n_langs,
-                max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range,
-                summary_type=self.summary_type,
-                use_proj=self.use_proj,
-            )
-
-            return (
-                config,
-                input_ids,
-                token_type_ids,
-                input_lengths,
-                sequence_labels,
-                token_labels,
-                is_impossible_labels,
-                input_mask,
-            )
-
-        def create_and_check_xlm_model(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = TFXLMModel(config=config)
-            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-            outputs = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            outputs = model(inputs)
-            sequence_output = outputs[0]
-            result = {
-                "sequence_output": sequence_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_xlm_lm_head(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = TFXLMWithLMHeadModel(config)
-
-            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
-            outputs = model(inputs)
-
-            logits = outputs[0]
-
-            result = {
-                "logits": logits.numpy(),
-            }
-
-            self.parent.assertListEqual(
-                list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_xlm_qa(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = TFXLMForQuestionAnsweringSimple(config)
-
-            inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-            start_logits, end_logits = model(inputs)
-
-            result = {
-                "start_logits": start_logits.numpy(),
-                "end_logits": end_logits.numpy(),
-            }
-
-            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-
-        def create_and_check_xlm_sequence_classif(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = TFXLMForSequenceClassification(config)
-
-            inputs = {"input_ids": input_ids, "lengths": input_lengths}
-
-            (logits,) = model(inputs)
-
-            result = {
-                "logits": logits.numpy(),
-            }
-
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_lengths,
-                sequence_labels,
-                token_labels,
-                is_impossible_labels,
-                input_mask,
-            ) = config_and_inputs
-            inputs_dict = {
-                "input_ids": input_ids,
-                "token_type_ids": token_type_ids,
-                "langs": token_type_ids,
-                "lengths": input_lengths,
-            }
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFXLMModelTest.TFXLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlm_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
-
-    def test_xlm_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)
-
-    def test_xlm_qa(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_qa(*config_and_inputs)
-
-    def test_xlm_sequence_classif(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFXLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_tf_xlnet.py b/server/transformers/tests/test_modeling_tf_xlnet.py
deleted file mode 100644
index 65c83395e542e90bd70117e3ab819b0d70e60183..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_tf_xlnet.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import random
-import unittest
-
-from transformers import XLNetConfig, is_tf_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_tf, slow
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers.modeling_tf_xlnet import (
-        TFXLNetModel,
-        TFXLNetLMHeadModel,
-        TFXLNetForSequenceClassification,
-        TFXLNetForTokenClassification,
-        TFXLNetForQuestionAnsweringSimple,
-        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
-
-
-@require_tf
-class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            TFXLNetModel,
-            TFXLNetLMHeadModel,
-            TFXLNetForSequenceClassification,
-            TFXLNetForTokenClassification,
-            TFXLNetForQuestionAnsweringSimple,
-        )
-        if is_tf_available()
-        else ()
-    )
-    test_pruning = False
-
-    class TFXLNetModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            mem_len=10,
-            clamp_len=-1,
-            reuse_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            num_attention_heads=4,
-            d_inner=128,
-            num_hidden_layers=5,
-            type_sequence_label_size=2,
-            untie_r=True,
-            bi_data=False,
-            same_length=False,
-            initializer_range=0.05,
-            seed=1,
-            type_vocab_size=2,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            # self.key_len = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.reuse_len = reuse_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.num_attention_heads = num_attention_heads
-            self.d_inner = d_inner
-            self.num_hidden_layers = num_hidden_layers
-            self.bi_data = bi_data
-            self.untie_r = untie_r
-            self.same_length = same_length
-            self.initializer_range = initializer_range
-            self.seed = seed
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
-
-            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32)
-            perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
-            perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
-            # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
-            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
-            target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
-            # target_mapping[:, 0, -1] = 1.0  # predict last token
-
-            sequence_labels = None
-            lm_labels = None
-            is_impossible_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-
-            config = XLNetConfig(
-                vocab_size=self.vocab_size,
-                d_model=self.hidden_size,
-                n_head=self.num_attention_heads,
-                d_inner=self.d_inner,
-                n_layer=self.num_hidden_layers,
-                untie_r=self.untie_r,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                same_length=self.same_length,
-                reuse_len=self.reuse_len,
-                bi_data=self.bi_data,
-                initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size,
-            )
-
-            return (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-            )
-
-        def set_seed(self):
-            random.seed(self.seed)
-            tf.random.set_seed(self.seed)
-
-        def create_and_check_xlnet_base_model(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ):
-            model = TFXLNetModel(config)
-
-            inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids}
-
-            _, _ = model(inputs)
-
-            inputs = [input_ids_1, input_mask]
-
-            outputs, mems_1 = model(inputs)
-
-            result = {
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "outputs": outputs.numpy(),
-            }
-
-            config.mem_len = 0
-            model = TFXLNetModel(config)
-            no_mems_outputs = model(inputs)
-            self.parent.assertEqual(len(no_mems_outputs), 1)
-
-            self.parent.assertListEqual(
-                list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_lm_head(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ):
-            model = TFXLNetLMHeadModel(config)
-
-            inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids}
-
-            all_logits_1, mems_1 = model(inputs_1)
-
-            inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids}
-
-            all_logits_2, mems_2 = model(inputs_2)
-
-            inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping}
-
-            logits, _ = model(inputs_3)
-
-            result = {
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "all_logits_1": all_logits_1.numpy(),
-                "mems_2": [mem.numpy() for mem in mems_2],
-                "all_logits_2": all_logits_2.numpy(),
-            }
-
-            self.parent.assertListEqual(
-                list(result["all_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-            self.parent.assertListEqual(
-                list(result["all_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_qa(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ):
-            model = TFXLNetForQuestionAnsweringSimple(config)
-
-            inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids}
-            start_logits, end_logits, mems = model(inputs)
-
-            result = {
-                "start_logits": start_logits.numpy(),
-                "end_logits": end_logits.numpy(),
-                "mems": [m.numpy() for m in mems],
-            }
-
-            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_sequence_classif(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ):
-            model = TFXLNetForSequenceClassification(config)
-
-            logits, mems_1 = model(input_ids_1)
-
-            result = {
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "logits": logits.numpy(),
-            }
-
-            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_for_token_classification(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-        ):
-            config.num_labels = input_ids_1.shape[1]
-            model = TFXLNetForTokenClassification(config)
-            inputs = {
-                "input_ids": input_ids_1,
-                "attention_mask": input_mask,
-                # 'token_type_ids': token_type_ids
-            }
-            logits, mems_1 = model(inputs)
-            result = {
-                "mems_1": [mem.numpy() for mem in mems_1],
-                "logits": logits.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.shape) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids_1}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlnet_base_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
-
-    def test_xlnet_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
-
-    def test_xlnet_sequence_classif(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
-
-    def test_xlnet_token_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)
-
-    def test_xlnet_qa(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFXLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_transfo_xl.py b/server/transformers/tests/test_modeling_transfo_xl.py
deleted file mode 100644
index b06bd8510673a84a97c44e82e8b0e14f0db42144..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_transfo_xl.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import random
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-    from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel
-    from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
-    test_pruning = False
-    test_torchscript = False
-    test_resize_embeddings = False
-
-    class TransfoXLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            mem_len=30,
-            clamp_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            d_embed=32,
-            num_attention_heads=4,
-            d_head=8,
-            d_inner=128,
-            div_val=2,
-            num_hidden_layers=5,
-            scope=None,
-            seed=1,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            self.key_length = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.d_embed = d_embed
-            self.num_attention_heads = num_attention_heads
-            self.d_head = d_head
-            self.d_inner = d_inner
-            self.div_val = div_val
-            self.num_hidden_layers = num_hidden_layers
-            self.scope = scope
-            self.seed = seed
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            lm_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            config = TransfoXLConfig(
-                vocab_size=self.vocab_size,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                cutoffs=self.cutoffs,
-                d_model=self.hidden_size,
-                d_embed=self.d_embed,
-                n_head=self.num_attention_heads,
-                d_head=self.d_head,
-                d_inner=self.d_inner,
-                div_val=self.div_val,
-                n_layer=self.num_hidden_layers,
-            )
-
-            return (config, input_ids_1, input_ids_2, lm_labels)
-
-        def set_seed(self):
-            random.seed(self.seed)
-            torch.manual_seed(self.seed)
-
-        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
-            model = TransfoXLModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            hidden_states_1, mems_1 = model(input_ids_1)
-            hidden_states_2, mems_2 = model(input_ids_2, mems_1)
-            outputs = {
-                "hidden_states_1": hidden_states_1,
-                "mems_1": mems_1,
-                "hidden_states_2": hidden_states_2,
-                "mems_2": mems_2,
-            }
-            return outputs
-
-        def check_transfo_xl_model_output(self, result):
-            self.parent.assertListEqual(
-                list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
-            model = TransfoXLLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            lm_logits_1, mems_1 = model(input_ids_1)
-            loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels)
-            lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1)
-            loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1)
-
-            outputs = {
-                "loss_1": loss_1,
-                "mems_1": mems_1,
-                "lm_logits_1": lm_logits_1,
-                "loss_2": loss_2,
-                "mems_2": mems_2,
-                "lm_logits_2": lm_logits_2,
-            }
-            return outputs
-
-        def check_transfo_xl_lm_head_output(self, result):
-            self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-            self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids_1}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_transfo_xl_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
-        self.model_tester.check_transfo_xl_model_output(output_result)
-
-    def test_transfo_xl_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
-        self.model_tester.check_transfo_xl_lm_head_output(output_result)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_xlm.py b/server/transformers/tests/test_modeling_xlm.py
deleted file mode 100644
index df5ac260fabdd0018657b99b7ccfbe4994aa44db..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_xlm.py
+++ /dev/null
@@ -1,392 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    from transformers import (
-        XLMConfig,
-        XLMModel,
-        XLMWithLMHeadModel,
-        XLMForQuestionAnswering,
-        XLMForSequenceClassification,
-        XLMForQuestionAnsweringSimple,
-    )
-    from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class XLMModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            XLMModel,
-            XLMWithLMHeadModel,
-            XLMForQuestionAnswering,
-            XLMForSequenceClassification,
-            XLMForQuestionAnsweringSimple,
-        )
-        if is_torch_available()
-        else ()
-    )
-
-    class XLMModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_lengths=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            gelu_activation=True,
-            sinusoidal_embeddings=False,
-            causal=False,
-            asm=False,
-            n_langs=2,
-            vocab_size=99,
-            n_special=0,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            summary_type="last",
-            use_proj=True,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_lengths = use_input_lengths
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.asm = asm
-            self.n_langs = n_langs
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.summary_type = summary_type
-            self.causal = causal
-            self.use_proj = use_proj
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.n_langs = n_langs
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.summary_type = summary_type
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
-
-            input_lengths = None
-            if self.use_input_lengths:
-                input_lengths = (
-                    ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
-                )  # small variation of seq_length
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
-
-            sequence_labels = None
-            token_labels = None
-            is_impossible_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
-
-            config = XLMConfig(
-                vocab_size=self.vocab_size,
-                n_special=self.n_special,
-                emb_dim=self.hidden_size,
-                n_layers=self.num_hidden_layers,
-                n_heads=self.num_attention_heads,
-                dropout=self.hidden_dropout_prob,
-                attention_dropout=self.attention_probs_dropout_prob,
-                gelu_activation=self.gelu_activation,
-                sinusoidal_embeddings=self.sinusoidal_embeddings,
-                asm=self.asm,
-                causal=self.causal,
-                n_langs=self.n_langs,
-                max_position_embeddings=self.max_position_embeddings,
-                initializer_range=self.initializer_range,
-                summary_type=self.summary_type,
-                use_proj=self.use_proj,
-            )
-
-            return (
-                config,
-                input_ids,
-                token_type_ids,
-                input_lengths,
-                sequence_labels,
-                token_labels,
-                is_impossible_labels,
-                input_mask,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_xlm_model(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = XLMModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
-            outputs = model(input_ids, langs=token_type_ids)
-            outputs = model(input_ids)
-            sequence_output = outputs[0]
-            result = {
-                "sequence_output": sequence_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_xlm_lm_head(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = XLMWithLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
-
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def create_and_check_xlm_simple_qa(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = XLMForQuestionAnsweringSimple(config)
-            model.to(torch_device)
-            model.eval()
-
-            outputs = model(input_ids)
-
-            outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
-            loss, start_logits, end_logits = outputs
-
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_xlm_qa(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = XLMForQuestionAnswering(config)
-            model.to(torch_device)
-            model.eval()
-
-            outputs = model(input_ids)
-            start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
-
-            outputs = model(
-                input_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-                p_mask=input_mask,
-            )
-
-            outputs = model(
-                input_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-            )
-
-            (total_loss,) = outputs
-
-            outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
-
-            (total_loss,) = outputs
-
-            result = {
-                "loss": total_loss,
-                "start_top_log_probs": start_top_log_probs,
-                "start_top_index": start_top_index,
-                "end_top_log_probs": end_top_log_probs,
-                "end_top_index": end_top_index,
-                "cls_logits": cls_logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
-            )
-            self.parent.assertListEqual(
-                list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_index"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
-
-        def create_and_check_xlm_sequence_classif(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_lengths,
-            sequence_labels,
-            token_labels,
-            is_impossible_labels,
-            input_mask,
-        ):
-            model = XLMForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-
-            (logits,) = model(input_ids)
-            loss, logits = model(input_ids, labels=sequence_labels)
-
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_lengths,
-                sequence_labels,
-                token_labels,
-                is_impossible_labels,
-                input_mask,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = XLMModelTest.XLMModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlm_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
-
-    def test_xlm_lm_head(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs)
-
-    def test_xlm_simple_qa(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs)
-
-    def test_xlm_qa(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_qa(*config_and_inputs)
-
-    def test_xlm_sequence_classif(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_modeling_xlnet.py b/server/transformers/tests/test_modeling_xlnet.py
deleted file mode 100644
index 8b57e4ae82a26e44af82a14b3024009073d213ba..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_modeling_xlnet.py
+++ /dev/null
@@ -1,501 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import random
-import unittest
-
-from transformers import is_torch_available
-
-from .test_configuration_common import ConfigTester
-from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow, torch_device
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        XLNetConfig,
-        XLNetModel,
-        XLNetLMHeadModel,
-        XLNetForSequenceClassification,
-        XLNetForTokenClassification,
-        XLNetForQuestionAnswering,
-    )
-    from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            XLNetModel,
-            XLNetLMHeadModel,
-            XLNetForTokenClassification,
-            XLNetForSequenceClassification,
-            XLNetForQuestionAnswering,
-        )
-        if is_torch_available()
-        else ()
-    )
-    test_pruning = False
-
-    class XLNetModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            mem_len=10,
-            clamp_len=-1,
-            reuse_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            num_attention_heads=4,
-            d_inner=128,
-            num_hidden_layers=5,
-            type_sequence_label_size=2,
-            untie_r=True,
-            bi_data=False,
-            same_length=False,
-            initializer_range=0.05,
-            seed=1,
-            type_vocab_size=2,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            # self.key_len = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.reuse_len = reuse_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.num_attention_heads = num_attention_heads
-            self.d_inner = d_inner
-            self.num_hidden_layers = num_hidden_layers
-            self.bi_data = bi_data
-            self.untie_r = untie_r
-            self.same_length = same_length
-            self.initializer_range = initializer_range
-            self.seed = seed
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
-
-            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = torch.zeros(
-                self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device
-            )
-            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = torch.zeros(
-                self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device
-            )
-            target_mapping[:, 0, -1] = 1.0  # predict last token
-
-            sequence_labels = None
-            lm_labels = None
-            is_impossible_labels = None
-            token_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            config = XLNetConfig(
-                vocab_size=self.vocab_size,
-                d_model=self.hidden_size,
-                n_head=self.num_attention_heads,
-                d_inner=self.d_inner,
-                n_layer=self.num_hidden_layers,
-                untie_r=self.untie_r,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                same_length=self.same_length,
-                reuse_len=self.reuse_len,
-                bi_data=self.bi_data,
-                initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size,
-            )
-
-            return (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-                token_labels,
-            )
-
-        def set_seed(self):
-            random.seed(self.seed)
-            torch.manual_seed(self.seed)
-
-        def create_and_check_xlnet_base_model(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            _, _ = model(input_ids_1, input_mask=input_mask)
-            _, _ = model(input_ids_1, attention_mask=input_mask)
-            _, _ = model(input_ids_1, token_type_ids=segment_ids)
-            outputs, mems_1 = model(input_ids_1)
-
-            result = {
-                "mems_1": mems_1,
-                "outputs": outputs,
-            }
-
-            config.mem_len = 0
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-            no_mems_outputs = model(input_ids_1)
-            self.parent.assertEqual(len(no_mems_outputs), 1)
-
-            self.parent.assertListEqual(
-                list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_base_model_with_att_output(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            _, _, attentions = model(input_ids_1, target_mapping=target_mapping)
-
-            self.parent.assertEqual(len(attentions), config.n_layer)
-            self.parent.assertIsInstance(attentions[0], tuple)
-            self.parent.assertEqual(len(attentions[0]), 2)
-            self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
-
-        def create_and_check_xlnet_lm_head(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
-
-            loss_2, all_logits_2, mems_2 = model(
-                input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1
-            )
-
-            logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
-
-            result = {
-                "loss_1": loss_1,
-                "mems_1": mems_1,
-                "all_logits_1": all_logits_1,
-                "loss_2": loss_2,
-                "mems_2": mems_2,
-                "all_logits_2": all_logits_2,
-            }
-
-            self.parent.assertListEqual(list(result["loss_1"].size()), [])
-            self.parent.assertListEqual(
-                list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-            self.parent.assertListEqual(list(result["loss_2"].size()), [])
-            self.parent.assertListEqual(
-                list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_qa(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForQuestionAnswering(config)
-            model.to(torch_device)
-            model.eval()
-
-            outputs = model(input_ids_1)
-            start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs
-
-            outputs = model(
-                input_ids_1,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-                p_mask=input_mask,
-            )
-
-            outputs = model(
-                input_ids_1,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-            )
-
-            total_loss, mems = outputs
-
-            outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels)
-
-            total_loss, mems = outputs
-
-            result = {
-                "loss": total_loss,
-                "start_top_log_probs": start_top_log_probs,
-                "start_top_index": start_top_index,
-                "end_top_log_probs": end_top_log_probs,
-                "end_top_index": end_top_index,
-                "cls_logits": cls_logits,
-                "mems": mems,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
-            )
-            self.parent.assertListEqual(
-                list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_index"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_token_classif(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForTokenClassification(config)
-            model.to(torch_device)
-            model.eval()
-
-            logits, mems_1 = model(input_ids_1)
-            loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
-
-            result = {
-                "loss": loss,
-                "mems_1": mems_1,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_sequence_classif(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-
-            logits, mems_1 = model(input_ids_1)
-            loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels)
-
-            result = {
-                "loss": loss,
-                "mems_1": mems_1,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-                token_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids_1}
-            return config, inputs_dict
-
-    def setUp(self):
-        self.model_tester = XLNetModelTest.XLNetModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_xlnet_base_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
-
-    def test_xlnet_base_model_with_att_output(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        config_and_inputs[0].output_attentions = True
-        self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs)
-
-    def test_xlnet_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
-
-    def test_xlnet_sequence_classif(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
-
-    def test_xlnet_token_classif(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs)
-
-    def test_xlnet_qa(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
-            self.assertIsNotNone(model)
diff --git a/server/transformers/tests/test_optimization.py b/server/transformers/tests/test_optimization.py
deleted file mode 100644
index 8c9ebb2dd27a96cb9f60b1e8d4068af54a9ba8b7..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_optimization.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import tempfile
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch
-
-
-if is_torch_available():
-    import torch
-
-    from transformers import (
-        AdamW,
-        get_constant_schedule,
-        get_constant_schedule_with_warmup,
-        get_cosine_schedule_with_warmup,
-        get_cosine_with_hard_restarts_schedule_with_warmup,
-        get_linear_schedule_with_warmup,
-    )
-
-
-def unwrap_schedule(scheduler, num_steps=10):
-    lrs = []
-    for _ in range(num_steps):
-        scheduler.step()
-        lrs.append(scheduler.get_lr())
-    return lrs
-
-
-def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
-    lrs = []
-    for step in range(num_steps):
-        scheduler.step()
-        lrs.append(scheduler.get_lr())
-        if step == num_steps // 2:
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                file_name = os.path.join(tmpdirname, "schedule.bin")
-                torch.save(scheduler.state_dict(), file_name)
-
-                state_dict = torch.load(file_name)
-                scheduler.load_state_dict(state_dict)
-    return lrs
-
-
-@require_torch
-class OptimizationTest(unittest.TestCase):
-    def assertListAlmostEqual(self, list1, list2, tol):
-        self.assertEqual(len(list1), len(list2))
-        for a, b in zip(list1, list2):
-            self.assertAlmostEqual(a, b, delta=tol)
-
-    def test_adam_w(self):
-        w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
-        target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss()
-        # No warmup, constant schedule, no gradient clipping
-        optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
-        for _ in range(100):
-            loss = criterion(w, target)
-            loss.backward()
-            optimizer.step()
-            w.grad.detach_()  # No zero_grad() function on simple tensors. we do it ourselves.
-            w.grad.zero_()
-        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
-
-
-@require_torch
-class ScheduleInitTest(unittest.TestCase):
-    m = torch.nn.Linear(50, 50) if is_torch_available() else None
-    optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
-    num_steps = 10
-
-    def assertListAlmostEqual(self, list1, list2, tol):
-        self.assertEqual(len(list1), len(list2))
-        for a, b in zip(list1, list2):
-            self.assertAlmostEqual(a, b, delta=tol)
-
-    def test_constant_scheduler(self):
-        scheduler = get_constant_schedule(self.optimizer)
-        lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [10.0] * self.num_steps
-        self.assertEqual(len(lrs[0]), 1)
-        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
-
-        scheduler = get_constant_schedule(self.optimizer)
-        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
-        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
-
-    def test_warmup_constant_scheduler(self):
-        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
-        lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
-        self.assertEqual(len(lrs[0]), 1)
-        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
-
-        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
-        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
-        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
-
-    def test_warmup_linear_scheduler(self):
-        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
-        lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
-        self.assertEqual(len(lrs[0]), 1)
-        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
-
-        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
-        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
-        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
-
-    def test_warmup_cosine_scheduler(self):
-        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
-        lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
-        self.assertEqual(len(lrs[0]), 1)
-        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
-
-        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
-        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
-        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
-
-    def test_warmup_cosine_hard_restart_scheduler(self):
-        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
-            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10
-        )
-        lrs = unwrap_schedule(scheduler, self.num_steps)
-        expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
-        self.assertEqual(len(lrs[0]), 1)
-        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
-
-        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
-            self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10
-        )
-        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
-        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
diff --git a/server/transformers/tests/test_optimization_tf.py b/server/transformers/tests/test_optimization_tf.py
deleted file mode 100644
index 6236c312967c04f311ff721b224d9a005ba8e98b..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_optimization_tf.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import unittest
-
-from transformers import is_tf_available
-
-from .utils import require_tf
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from tensorflow.python.eager import context
-    from tensorflow.python.framework import ops
-    from transformers import create_optimizer, GradientAccumulator
-
-
-@require_tf
-class OptimizationFTest(unittest.TestCase):
-    def assertListAlmostEqual(self, list1, list2, tol):
-        self.assertEqual(len(list1), len(list2))
-        for a, b in zip(list1, list2):
-            self.assertAlmostEqual(a, b, delta=tol)
-
-    def testGradientAccumulator(self):
-        accumulator = GradientAccumulator()
-        accumulator([tf.constant([1.0, 2.0])])
-        accumulator([tf.constant([-2.0, 1.0])])
-        accumulator([tf.constant([-1.0, 2.0])])
-        with self.assertRaises(ValueError):
-            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
-        self.assertEqual(accumulator.step, 3)
-        self.assertEqual(len(accumulator.gradients), 1)
-        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
-        accumulator.reset()
-        self.assertEqual(accumulator.step, 0)
-        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
-
-    def testGradientAccumulatorDistributionStrategy(self):
-        context._context = None
-        ops.enable_eager_execution_internal()
-        physical_devices = tf.config.experimental.list_physical_devices("CPU")
-        tf.config.experimental.set_virtual_device_configuration(
-            physical_devices[0],
-            [tf.config.experimental.VirtualDeviceConfiguration(), tf.config.experimental.VirtualDeviceConfiguration()],
-        )
-
-        devices = tf.config.experimental.list_logical_devices(device_type="CPU")
-        strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
-
-        with strategy.scope():
-            accumulator = GradientAccumulator()
-            variable = tf.Variable([4.0, 3.0])
-            optimizer = create_optimizer(5e-5, 10, 5)
-            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
-
-        def accumulate_on_replica(gradient):
-            accumulator([gradient])
-
-        def apply_on_replica():
-            optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)
-
-        @tf.function
-        def accumulate(grad1, grad2):
-            with strategy.scope():
-                gradient_placeholder.values[0].assign(grad1)
-                gradient_placeholder.values[1].assign(grad2)
-                strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))
-
-        @tf.function
-        def apply_grad():
-            with strategy.scope():
-                strategy.experimental_run_v2(apply_on_replica)
-
-        accumulate([1.0, 2.0], [-1.0, 1.0])
-        accumulate([3.0, -1.0], [-1.0, -1.0])
-        accumulate([-2.0, 2.0], [3.0, -2.0])
-        self.assertEqual(accumulator.step, 3)
-        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
-        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
-        apply_grad()
-        self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
-        accumulator.reset()
-        self.assertEqual(accumulator.step, 0)
-        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
-        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
diff --git a/server/transformers/tests/test_pipelines.py b/server/transformers/tests/test_pipelines.py
deleted file mode 100644
index 3a4535d153828820d8973af36c487750ff95a13f..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_pipelines.py
+++ /dev/null
@@ -1,297 +0,0 @@
-import unittest
-from typing import Iterable, List, Optional
-
-from transformers import pipeline
-from transformers.pipelines import Pipeline
-
-from .utils import require_tf, require_torch
-
-
-QA_FINETUNED_MODELS = {
-    ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
-    ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
-    ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
-}
-
-TF_QA_FINETUNED_MODELS = {
-    ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
-    ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
-    ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
-}
-
-TF_NER_FINETUNED_MODELS = {
-    (
-        "bert-base-cased",
-        "dbmdz/bert-large-cased-finetuned-conll03-english",
-        "dbmdz/bert-large-cased-finetuned-conll03-english",
-    )
-}
-
-NER_FINETUNED_MODELS = {
-    (
-        "bert-base-cased",
-        "dbmdz/bert-large-cased-finetuned-conll03-english",
-        "dbmdz/bert-large-cased-finetuned-conll03-english",
-    )
-}
-
-FEATURE_EXTRACT_FINETUNED_MODELS = {
-    ("bert-base-cased", "bert-base-cased", None),
-    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-    ("distilbert-base-uncased", "distilbert-base-uncased", None),
-}
-
-TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
-    ("bert-base-cased", "bert-base-cased", None),
-    # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-    ("distilbert-base-uncased", "distilbert-base-uncased", None),
-}
-
-TF_TEXT_CLASSIF_FINETUNED_MODELS = {
-    (
-        "bert-base-uncased",
-        "distilbert-base-uncased-finetuned-sst-2-english",
-        "distilbert-base-uncased-finetuned-sst-2-english",
-    )
-}
-
-TEXT_CLASSIF_FINETUNED_MODELS = {
-    (
-        "bert-base-uncased",
-        "distilbert-base-uncased-finetuned-sst-2-english",
-        "distilbert-base-uncased-finetuned-sst-2-english",
-    )
-}
-
-FILL_MASK_FINETUNED_MODELS = {
-    ("distilroberta-base", "distilroberta-base", None),
-}
-
-TF_FILL_MASK_FINETUNED_MODELS = {
-    ("distilroberta-base", "distilroberta-base", None),
-}
-
-
-class MonoColumnInputTestCase(unittest.TestCase):
-    def _test_mono_column_pipeline(
-        self,
-        nlp: Pipeline,
-        valid_inputs: List,
-        invalid_inputs: List,
-        output_keys: Iterable[str],
-        expected_multi_result: Optional[List] = None,
-        expected_check_keys: Optional[List[str]] = None,
-    ):
-        self.assertIsNotNone(nlp)
-
-        mono_result = nlp(valid_inputs[0])
-        self.assertIsInstance(mono_result, list)
-        self.assertIsInstance(mono_result[0], (dict, list))
-
-        if isinstance(mono_result[0], list):
-            mono_result = mono_result[0]
-
-        for key in output_keys:
-            self.assertIn(key, mono_result[0])
-
-        multi_result = nlp(valid_inputs)
-        self.assertIsInstance(multi_result, list)
-        self.assertIsInstance(multi_result[0], (dict, list))
-
-        if expected_multi_result is not None:
-            for result, expect in zip(multi_result, expected_multi_result):
-                for key in expected_check_keys or []:
-                    self.assertEqual(
-                        set([o[key] for o in result]), set([o[key] for o in expect]),
-                    )
-
-        if isinstance(multi_result[0], list):
-            multi_result = multi_result[0]
-
-        for result in multi_result:
-            for key in output_keys:
-                self.assertIn(key, result)
-
-        self.assertRaises(Exception, nlp, invalid_inputs)
-
-    @require_torch
-    def test_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
-
-    @require_tf
-    def test_tf_ner(self):
-        mandatory_keys = {"entity", "word", "score"}
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in TF_NER_FINETUNED_MODELS:
-            nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
-
-    @require_torch
-    def test_sentiment_analysis(self):
-        mandatory_keys = {"label", "score"}
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
-            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
-
-    @require_tf
-    def test_tf_sentiment_analysis(self):
-        mandatory_keys = {"label", "score"}
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
-            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
-
-    @require_torch
-    def test_feature_extraction(self):
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
-
-    @require_tf
-    def test_tf_feature_extraction(self):
-        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
-        invalid_inputs = [None]
-        for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
-            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
-
-    @require_torch
-    def test_fill_mask(self):
-        mandatory_keys = {"sequence", "score", "token"}
-        valid_inputs = [
-            "My name is <mask>",
-            "The largest city in France is <mask>",
-        ]
-        invalid_inputs = [None]
-        expected_multi_result = [
-            [
-                {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
-                {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
-            ],
-            [
-                {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
-                {
-                    "score": 0.19764970242977142,
-                    "sequence": "<s>The largest city in France is Lyon</s>",
-                    "token": 12790,
-                },
-            ],
-        ]
-        for tokenizer, model, config in FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
-            self._test_mono_column_pipeline(
-                nlp,
-                valid_inputs,
-                invalid_inputs,
-                mandatory_keys,
-                expected_multi_result=expected_multi_result,
-                expected_check_keys=["sequence"],
-            )
-
-    @require_tf
-    def test_tf_fill_mask(self):
-        mandatory_keys = {"sequence", "score", "token"}
-        valid_inputs = [
-            "My name is <mask>",
-            "The largest city in France is <mask>",
-        ]
-        invalid_inputs = [None]
-        expected_multi_result = [
-            [
-                {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
-                {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
-            ],
-            [
-                {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
-                {
-                    "score": 0.19764970242977142,
-                    "sequence": "<s>The largest city in France is Lyon</s>",
-                    "token": 12790,
-                },
-            ],
-        ]
-        for tokenizer, model, config in TF_FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
-            self._test_mono_column_pipeline(
-                nlp,
-                valid_inputs,
-                invalid_inputs,
-                mandatory_keys,
-                expected_multi_result=expected_multi_result,
-                expected_check_keys=["sequence"],
-            )
-
-
-class MultiColumnInputTestCase(unittest.TestCase):
-    def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
-        self.assertIsNotNone(nlp)
-
-        mono_result = nlp(valid_inputs[0])
-        self.assertIsInstance(mono_result, dict)
-
-        for key in output_keys:
-            self.assertIn(key, mono_result)
-
-        multi_result = nlp(valid_inputs)
-        self.assertIsInstance(multi_result, list)
-        self.assertIsInstance(multi_result[0], dict)
-
-        for result in multi_result:
-            for key in output_keys:
-                self.assertIn(key, result)
-
-        self.assertRaises(Exception, nlp, invalid_inputs[0])
-        self.assertRaises(Exception, nlp, invalid_inputs)
-
-    @require_torch
-    def test_question_answering(self):
-        mandatory_output_keys = {"score", "answer", "start", "end"}
-        valid_samples = [
-            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
-            {
-                "question": "In what field is HuggingFace working ?",
-                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
-            },
-        ]
-        invalid_samples = [
-            {"question": "", "context": "This is a test to try empty question edge case"},
-            {"question": None, "context": "This is a test to try empty question edge case"},
-            {"question": "What is does with empty context ?", "context": ""},
-            {"question": "What is does with empty context ?", "context": None},
-        ]
-
-        for tokenizer, model, config in QA_FINETUNED_MODELS:
-            nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer)
-            self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
-
-    @require_tf
-    def test_tf_question_answering(self):
-        mandatory_output_keys = {"score", "answer", "start", "end"}
-        valid_samples = [
-            {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
-            {
-                "question": "In what field is HuggingFace working ?",
-                "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
-            },
-        ]
-        invalid_samples = [
-            {"question": "", "context": "This is a test to try empty question edge case"},
-            {"question": None, "context": "This is a test to try empty question edge case"},
-            {"question": "What is does with empty context ?", "context": ""},
-            {"question": "What is does with empty context ?", "context": None},
-        ]
-
-        for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
-            nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer)
-            self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
diff --git a/server/transformers/tests/test_tokenization_albert.py b/server/transformers/tests/test_tokenization_albert.py
deleted file mode 100644
index c190d8ed826330e5c88d9be09c25a8a406b86b3e..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_albert.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding=utf-8
-# Copyright 2019 Hugging Face inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_albert import AlbertTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
-
-
-class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = AlbertTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def get_tokenizer(self, **kwargs):
-        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
-        )
-
-    def test_sequence_builders(self):
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
-            tokenizer.sep_token_id
-        ]
diff --git a/server/transformers/tests/test_tokenization_auto.py b/server/transformers/tests/test_tokenization_auto.py
deleted file mode 100644
index 5ce9228287046e066172eba3c91d0788fda63918..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_auto.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-import unittest
-
-from transformers import (
-    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    AutoTokenizer,
-    BertTokenizer,
-    GPT2Tokenizer,
-    RobertaTokenizer,
-)
-from transformers.tokenization_auto import TOKENIZER_MAPPING
-
-from .utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, slow  # noqa: F401
-
-
-class AutoTokenizerTest(unittest.TestCase):
-    # @slow
-    def test_tokenizer_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            self.assertIsNotNone(tokenizer)
-            self.assertIsInstance(tokenizer, BertTokenizer)
-            self.assertGreater(len(tokenizer), 0)
-
-        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            self.assertIsNotNone(tokenizer)
-            self.assertIsInstance(tokenizer, GPT2Tokenizer)
-            self.assertGreater(len(tokenizer), 0)
-
-    def test_tokenizer_from_pretrained_identifier(self):
-        logging.basicConfig(level=logging.INFO)
-        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(tokenizer, BertTokenizer)
-        self.assertEqual(len(tokenizer), 12)
-
-    def test_tokenizer_from_model_type(self):
-        logging.basicConfig(level=logging.INFO)
-        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
-        self.assertIsInstance(tokenizer, RobertaTokenizer)
-        self.assertEqual(len(tokenizer), 20)
-
-    def test_tokenizer_identifier_with_correct_config(self):
-        logging.basicConfig(level=logging.INFO)
-        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
-            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
-            self.assertIsInstance(tokenizer, BertTokenizer)
-            self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
-            self.assertEqual(tokenizer.max_len, 512)
-
-    def test_tokenizer_identifier_non_existent(self):
-        logging.basicConfig(level=logging.INFO)
-        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
-            with self.assertRaises(EnvironmentError):
-                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
-
-    def test_parents_and_children_in_mappings(self):
-        # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
-        # by the parents and will return the wrong configuration type when using auto models
-
-        mappings = (TOKENIZER_MAPPING,)
-
-        for mapping in mappings:
-            mapping = tuple(mapping.items())
-            for index, (child_config, child_model) in enumerate(mapping[1:]):
-                for parent_config, parent_model in mapping[: index + 1]:
-                    with self.subTest(
-                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
-                    ):
-                        self.assertFalse(issubclass(child_config, parent_config))
-                        self.assertFalse(issubclass(child_model, parent_model))
diff --git a/server/transformers/tests/test_tokenization_bert.py b/server/transformers/tests/test_tokenization_bert.py
deleted file mode 100644
index 49bb073351d150ee2737783defe61c35814ccc22..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_bert.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_bert import (
-    VOCAB_FILES_NAMES,
-    BasicTokenizer,
-    BertTokenizer,
-    BertTokenizerFast,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
-)
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import slow
-
-
-class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = BertTokenizer
-    test_rust_tokenizer = True
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_rust_tokenizer(self, **kwargs):
-        return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "UNwant\u00E9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer(add_special_tokens=False)
-
-        sequence = "UNwant\u00E9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for (i, token) in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00A0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
-
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
-
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
diff --git a/server/transformers/tests/test_tokenization_bert_japanese.py b/server/transformers/tests/test_tokenization_bert_japanese.py
deleted file mode 100644
index 4900ff49da50690e129038716a03d558ba614b9e..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_bert_japanese.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_bert import WordpieceTokenizer
-from transformers.tokenization_bert_japanese import (
-    VOCAB_FILES_NAMES,
-    BertJapaneseTokenizer,
-    CharacterTokenizer,
-    MecabTokenizer,
-)
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import custom_tokenizers, slow
-
-
-@custom_tokenizers
-class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = BertJapaneseTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "こんにちは",
-            "こん",
-            "にちは",
-            "ばんは",
-            "##こん",
-            "##にちは",
-            "##ばんは",
-            "世界",
-            "##世界",
-            "、",
-            "##、",
-            "。",
-            "##。",
-        ]
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "こんにちは、世界。 \nこんばんは、世界。"
-        output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。")
-        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-            ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
-        )
-
-    def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-            ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"],
-        )
-
-    def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
-            ["ｱｯﾌﾟﾙストア", "で", "iPhone", "８", "が", "発売", "さ", "れ", "た", "　", "。"],
-        )
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"]
-
-        vocab = {}
-        for (i, token) in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"])
-
-        self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])
-
-        self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
-
-        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
-        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        # 2 is for "[CLS]", 3 is for "[SEP]"
-        assert encoded_sentence == [2] + text + [3]
-        assert encoded_pair == [2] + text + [3] + text_2 + [3]
-
-
-class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = BertJapaneseTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "こんにちは、世界。 \nこんばんは、世界。"
-        output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
-
-        tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
-        self.assertListEqual(
-            tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"]
-        )
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
-        )
-
-    def test_character_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界" "、", "。"]
-
-        vocab = {}
-        for (i, token) in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"])
-
-        self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"])
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
-
-        text = tokenizer.encode("ありがとう。", add_special_tokens=False)
-        text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        # 2 is for "[CLS]", 3 is for "[SEP]"
-        assert encoded_sentence == [2] + text + [3]
-        assert encoded_pair == [2] + text + [3] + text_2 + [3]
diff --git a/server/transformers/tests/test_tokenization_common.py b/server/transformers/tests/test_tokenization_common.py
deleted file mode 100644
index 9867b189915fb4d56fa61c63a833f275e2c99b02..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_common.py
+++ /dev/null
@@ -1,510 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import pickle
-import shutil
-import tempfile
-
-
-class TokenizerTesterMixin:
-
-    tokenizer_class = None
-    test_rust_tokenizer = False
-
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def get_tokenizer(self, **kwargs):
-        raise NotImplementedError
-
-    def get_rust_tokenizer(self, **kwargs):
-        raise NotImplementedError
-
-    def get_input_output_texts(self):
-        raise NotImplementedError
-
-    def test_tokenizers_common_properties(self):
-        tokenizer = self.get_tokenizer()
-        attributes_list = [
-            "bos_token",
-            "eos_token",
-            "unk_token",
-            "sep_token",
-            "pad_token",
-            "cls_token",
-            "mask_token",
-        ]
-        for attr in attributes_list:
-            self.assertTrue(hasattr(tokenizer, attr))
-            self.assertTrue(hasattr(tokenizer, attr + "_id"))
-
-        self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
-        self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
-
-        attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
-        for attr in attributes_list:
-            self.assertTrue(hasattr(tokenizer, attr))
-
-    def test_save_and_load_tokenizer(self):
-        # safety check on max_len default value so we are sure the test works
-        tokenizer = self.get_tokenizer()
-        self.assertNotEqual(tokenizer.max_len, 42)
-
-        # Now let's start the test
-        tokenizer = self.get_tokenizer(max_len=42)
-
-        before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            tokenizer.save_pretrained(tmpdirname)
-            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
-
-            after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
-            self.assertListEqual(before_tokens, after_tokens)
-
-            self.assertEqual(tokenizer.max_len, 42)
-            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
-            self.assertEqual(tokenizer.max_len, 43)
-
-    def test_pickle_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-        self.assertIsNotNone(tokenizer)
-
-        text = "Munich and Berlin are nice cities"
-        subwords = tokenizer.tokenize(text)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-
-            filename = os.path.join(tmpdirname, "tokenizer.bin")
-            with open(filename, "wb") as handle:
-                pickle.dump(tokenizer, handle)
-
-            with open(filename, "rb") as handle:
-                tokenizer_new = pickle.load(handle)
-
-        subwords_loaded = tokenizer_new.tokenize(text)
-
-        self.assertListEqual(subwords, subwords_loaded)
-
-    def test_added_tokens_do_lower_case(self):
-        tokenizer = self.get_tokenizer(do_lower_case=True)
-
-        special_token = tokenizer.all_special_tokens[0]
-
-        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
-        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
-
-        toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
-
-        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
-        added = tokenizer.add_tokens(new_toks)
-        self.assertEqual(added, 2)
-
-        toks = tokenizer.tokenize(text)
-        toks2 = tokenizer.tokenize(text2)
-
-        self.assertEqual(len(toks), len(toks2))
-        self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
-        self.assertListEqual(toks, toks2)
-
-        # Check that none of the special tokens are lowercased
-        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
-        tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
-
-        for special_token in tokenizer.all_special_tokens:
-            self.assertTrue(special_token in tokenized_sequence)
-
-        tokenizer = self.get_tokenizer(do_lower_case=False)
-
-        added = tokenizer.add_tokens(new_toks)
-        self.assertEqual(added, 4)
-
-        toks = tokenizer.tokenize(text)
-        toks2 = tokenizer.tokenize(text2)
-
-        self.assertEqual(len(toks), len(toks2))  # Length should still be the same
-        self.assertNotEqual(len(toks), len(toks0))
-        self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
-
-    def test_add_tokens_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-
-        vocab_size = tokenizer.vocab_size
-        all_size = len(tokenizer)
-
-        self.assertNotEqual(vocab_size, 0)
-        self.assertEqual(vocab_size, all_size)
-
-        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
-        added_toks = tokenizer.add_tokens(new_toks)
-        vocab_size_2 = tokenizer.vocab_size
-        all_size_2 = len(tokenizer)
-
-        self.assertNotEqual(vocab_size_2, 0)
-        self.assertEqual(vocab_size, vocab_size_2)
-        self.assertEqual(added_toks, len(new_toks))
-        self.assertEqual(all_size_2, all_size + len(new_toks))
-
-        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
-
-        self.assertGreaterEqual(len(tokens), 4)
-        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-
-        new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
-        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
-        vocab_size_3 = tokenizer.vocab_size
-        all_size_3 = len(tokenizer)
-
-        self.assertNotEqual(vocab_size_3, 0)
-        self.assertEqual(vocab_size, vocab_size_3)
-        self.assertEqual(added_toks_2, len(new_toks_2))
-        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
-
-        tokens = tokenizer.encode(
-            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
-        )
-
-        self.assertGreaterEqual(len(tokens), 6)
-        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-        self.assertGreater(tokens[0], tokens[1])
-        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-        self.assertGreater(tokens[-2], tokens[-3])
-        self.assertEqual(tokens[0], tokenizer.eos_token_id)
-        self.assertEqual(tokens[-2], tokenizer.pad_token_id)
-
-    def test_add_special_tokens(self):
-        tokenizer = self.get_tokenizer()
-        input_text, output_text = self.get_input_output_texts()
-
-        special_token = "[SPECIAL TOKEN]"
-
-        tokenizer.add_special_tokens({"cls_token": special_token})
-        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
-        assert len(encoded_special_token) == 1
-
-        text = " ".join([input_text, special_token, output_text])
-        encoded = tokenizer.encode(text, add_special_tokens=False)
-
-        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
-        output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
-        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
-        assert encoded == input_encoded + special_token_id + output_encoded
-
-        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
-        assert special_token not in decoded
-
-    def test_required_methods_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-        input_text, output_text = self.get_input_output_texts()
-
-        tokens = tokenizer.tokenize(input_text)
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
-        self.assertListEqual(ids, ids_2)
-
-        tokens_2 = tokenizer.convert_ids_to_tokens(ids)
-        text_2 = tokenizer.decode(ids)
-
-        self.assertEqual(text_2, output_text)
-
-        self.assertNotEqual(len(tokens_2), 0)
-        self.assertIsInstance(text_2, str)
-
-    def test_encode_decode_with_spaces(self):
-        tokenizer = self.get_tokenizer()
-
-        new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
-        tokenizer.add_tokens(new_toks)
-        input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
-        encoded = tokenizer.encode(input, add_special_tokens=False)
-        decoded = tokenizer.decode(encoded)
-        self.assertEqual(decoded, input)
-
-    def test_pretrained_model_lists(self):
-        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
-        weights_lists_2 = []
-        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
-            weights_lists_2.append(list(map_list.keys()))
-
-        for weights_list_2 in weights_lists_2:
-            self.assertListEqual(weights_list, weights_list_2)
-
-    def test_mask_output(self):
-        tokenizer = self.get_tokenizer()
-
-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
-            seq_0 = "Test this method."
-            seq_1 = "With these inputs."
-            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
-            sequences, mask = information["input_ids"], information["token_type_ids"]
-            self.assertEqual(len(sequences), len(mask))
-
-    def test_number_of_added_tokens(self):
-        tokenizer = self.get_tokenizer()
-
-        seq_0 = "Test this method."
-        seq_1 = "With these inputs."
-
-        sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
-        attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-
-        # Method is implemented (e.g. not GPT-2)
-        if len(attached_sequences) != 2:
-            self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
-
-    def test_maximum_encoding_length_single_input(self):
-        tokenizer = self.get_tokenizer()
-
-        seq_0 = "This is a sentence to be encoded."
-        stride = 2
-
-        sequence = tokenizer.encode(seq_0, add_special_tokens=False)
-        num_added_tokens = tokenizer.num_added_tokens()
-        total_length = len(sequence) + num_added_tokens
-        information = tokenizer.encode_plus(
-            seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride, return_overflowing_tokens=True,
-        )
-
-        truncated_sequence = information["input_ids"]
-        overflowing_tokens = information["overflowing_tokens"]
-
-        self.assertEqual(len(overflowing_tokens), 2 + stride)
-        self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
-        self.assertEqual(len(truncated_sequence), total_length - 2)
-        self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
-
-    def test_maximum_encoding_length_pair_input(self):
-        tokenizer = self.get_tokenizer()
-
-        seq_0 = "This is a sentence to be encoded."
-        seq_1 = "This is another sentence to be encoded."
-        stride = 2
-
-        sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
-        sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
-
-        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-        truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
-            tokenizer.encode(seq_0, add_special_tokens=False), tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
-        )
-
-        information = tokenizer.encode_plus(
-            seq_0,
-            seq_1,
-            max_length=len(sequence) - 2,
-            add_special_tokens=True,
-            stride=stride,
-            truncation_strategy="only_second",
-            return_overflowing_tokens=True,
-        )
-        information_first_truncated = tokenizer.encode_plus(
-            seq_0,
-            seq_1,
-            max_length=len(sequence) - 2,
-            add_special_tokens=True,
-            stride=stride,
-            truncation_strategy="only_first",
-            return_overflowing_tokens=True,
-        )
-
-        truncated_sequence = information["input_ids"]
-        overflowing_tokens = information["overflowing_tokens"]
-        overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
-
-        self.assertEqual(len(overflowing_tokens), 2 + stride)
-        self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
-        self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
-        self.assertEqual(len(truncated_sequence), len(sequence) - 2)
-        self.assertEqual(truncated_sequence, truncated_second_sequence)
-
-    def test_encode_input_type(self):
-        tokenizer = self.get_tokenizer()
-
-        sequence = "Let's encode this sequence"
-
-        tokens = tokenizer.tokenize(sequence)
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-        formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
-
-        self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
-        self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
-
-    def test_special_tokens_mask(self):
-        tokenizer = self.get_tokenizer()
-
-        sequence_0 = "Encode this."
-        sequence_1 = "This one too please."
-
-        # Testing single inputs
-        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
-        encoded_sequence_dict = tokenizer.encode_plus(
-            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
-        )
-        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
-
-        filtered_sequence = [
-            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
-        ]
-        filtered_sequence = [x for x in filtered_sequence if x is not None]
-        self.assertEqual(encoded_sequence, filtered_sequence)
-
-        # Testing inputs pairs
-        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
-            sequence_1, add_special_tokens=False
-        )
-        encoded_sequence_dict = tokenizer.encode_plus(
-            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
-        )
-        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
-        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
-
-        filtered_sequence = [
-            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
-        ]
-        filtered_sequence = [x for x in filtered_sequence if x is not None]
-        self.assertEqual(encoded_sequence, filtered_sequence)
-
-        # Testing with already existing special tokens
-        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
-            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
-        encoded_sequence_dict = tokenizer.encode_plus(
-            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
-        )
-        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
-        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
-        special_tokens_mask = tokenizer.get_special_tokens_mask(
-            encoded_sequence_w_special, already_has_special_tokens=True
-        )
-        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
-        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
-
-    def test_padding_to_max_length(self):
-        tokenizer = self.get_tokenizer()
-
-        sequence = "Sequence"
-        padding_size = 10
-        padding_idx = tokenizer.pad_token_id
-
-        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-        tokenizer.padding_side = "right"
-        encoded_sequence = tokenizer.encode(sequence)
-        sequence_length = len(encoded_sequence)
-        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
-        padded_sequence_length = len(padded_sequence)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-        tokenizer.padding_side = "left"
-        encoded_sequence = tokenizer.encode(sequence)
-        sequence_length = len(encoded_sequence)
-        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
-        padded_sequence_length = len(padded_sequence)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
-
-        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
-        encoded_sequence = tokenizer.encode(sequence)
-        sequence_length = len(encoded_sequence)
-
-        tokenizer.padding_side = "right"
-        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
-        padded_sequence_right_length = len(padded_sequence_right)
-
-        tokenizer.padding_side = "left"
-        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
-        padded_sequence_left_length = len(padded_sequence_left)
-
-        assert sequence_length == padded_sequence_right_length
-        assert encoded_sequence == padded_sequence_right
-        assert sequence_length == padded_sequence_left_length
-        assert encoded_sequence == padded_sequence_left
-
-    def test_encode_plus_with_padding(self):
-        tokenizer = self.get_tokenizer()
-
-        sequence = "Sequence"
-        padding_size = 10
-        padding_idx = tokenizer.pad_token_id
-        token_type_padding_idx = tokenizer.pad_token_type_id
-
-        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
-        input_ids = encoded_sequence["input_ids"]
-        token_type_ids = encoded_sequence["token_type_ids"]
-        attention_mask = encoded_sequence["attention_mask"]
-        special_tokens_mask = encoded_sequence["special_tokens_mask"]
-        sequence_length = len(input_ids)
-
-        # Test right padding
-        tokenizer.padding_side = "right"
-        padded_sequence = tokenizer.encode_plus(
-            sequence,
-            max_length=sequence_length + padding_size,
-            pad_to_max_length=True,
-            return_special_tokens_mask=True,
-        )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-
-        assert sequence_length + padding_size == padded_sequence_length
-        assert input_ids + [padding_idx] * padding_size == padded_input_ids
-        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
-        assert attention_mask + [0] * padding_size == padded_attention_mask
-        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
-
-        # Test left padding
-        tokenizer.padding_side = "left"
-        padded_sequence = tokenizer.encode_plus(
-            sequence,
-            max_length=sequence_length + padding_size,
-            pad_to_max_length=True,
-            return_special_tokens_mask=True,
-        )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + input_ids == padded_input_ids
-        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
-        assert [0] * padding_size + attention_mask == padded_attention_mask
-        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
-
-    def test_separate_tokenizers(self):
-        # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
-        # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.
-
-        tokenizer = self.get_tokenizer(random_argument=True)
-        print(tokenizer.init_kwargs)
-        assert tokenizer.init_kwargs["random_argument"] is True
-        new_tokenizer = self.get_tokenizer(random_argument=False)
-        print(tokenizer.init_kwargs)
-        print(new_tokenizer.init_kwargs)
-        assert tokenizer.init_kwargs["random_argument"] is True
-        assert new_tokenizer.init_kwargs["random_argument"] is False
diff --git a/server/transformers/tests/test_tokenization_ctrl.py b/server/transformers/tests/test_tokenization_ctrl.py
deleted file mode 100644
index 8b57dc49d347c3515e9c30804c660640c20ccf0c..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_ctrl.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Salesforce and HuggingFace Inc. team.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = CTRLTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "adapt react readapt apt"
-        output_text = "adapt react readapt apt"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "adapt react readapt apt"
-        bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split()
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-
-        input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
diff --git a/server/transformers/tests/test_tokenization_distilbert.py b/server/transformers/tests/test_tokenization_distilbert.py
deleted file mode 100644
index a142b8d8f92f0dee5bc747929f78895fb6a3f9ad..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_distilbert.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from transformers.tokenization_distilbert import DistilBertTokenizer
-
-from .test_tokenization_bert import BertTokenizationTest
-from .utils import slow
-
-
-class DistilBertTokenizationTest(BertTokenizationTest):
-
-    tokenizer_class = DistilBertTokenizer
-
-    def get_tokenizer(self, **kwargs):
-        return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
-            tokenizer.sep_token_id
-        ]
diff --git a/server/transformers/tests/test_tokenization_gpt2.py b/server/transformers/tests/test_tokenization_gpt2.py
deleted file mode 100644
index 12b7b0eeb1674f4719246ecadeea3c5fc823a5dc..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_gpt2.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer, GPT2TokenizerFast
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = GPT2Tokenizer
-    test_rust_tokenizer = True
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "<unk>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            return
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer(add_special_tokens=False, add_prefix_space=True)
-
-        sequence = "lower newer"
-
-        # Testing tokenization
-        tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        # Testing conversion to ids without special tokens
-        ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing conversion to ids with special tokens
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-        ids = tokenizer.encode(sequence, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing the unknown token
-        input_tokens = tokens + [rust_tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
diff --git a/server/transformers/tests/test_tokenization_openai.py b/server/transformers/tests/test_tokenization_openai.py
deleted file mode 100644
index f89ec61ff61153f244adc47ea8c777cd404593d8..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_openai.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = OpenAIGPTTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "w</w>",
-            "r</w>",
-            "t</w>",
-            "lo",
-            "low",
-            "er</w>",
-            "low</w>",
-            "lowest</w>",
-            "newer</w>",
-            "wider</w>",
-            "<unk>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
-            fp.write("\n".join(merges))
-
-    def get_tokenizer(self, **kwargs):
-        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
-
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
diff --git a/server/transformers/tests/test_tokenization_roberta.py b/server/transformers/tests/test_tokenization_roberta.py
deleted file mode 100644
index f9abdea66623af2b9aa2aeca27d18dfdd7b9d5e2..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_roberta.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import slow
-
-
-class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    tokenizer_class = RobertaTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "<unk>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def roberta_dict_integration_testing(self):
-        tokenizer = self.get_tokenizer()
-
-        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
-        self.assertListEqual(
-            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
-            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
-        )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
-        encoded_pair_from_decode = tokenizer.encode(
-            "sequence builders", "multi-sequence build", add_special_tokens=True
-        )
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == encoded_text_from_decode
-        assert encoded_pair == encoded_pair_from_decode
diff --git a/server/transformers/tests/test_tokenization_t5.py b/server/transformers/tests/test_tokenization_t5.py
deleted file mode 100644
index 793d80ac646ac23718d50917d40a12a8408c0b8c..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_t5.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_t5 import T5Tokenizer
-from transformers.tokenization_xlnet import SPIECE_UNDERLINE
-
-from .test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-
-
-class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = T5Tokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def get_tokenizer(self, **kwargs):
-        return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "This is a test"
-        output_text = "This is a test"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "<unk>",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "<unk>",
-                ".",
-            ],
-        )
diff --git a/server/transformers/tests/test_tokenization_transfo_xl.py b/server/transformers/tests/test_tokenization_transfo_xl.py
deleted file mode 100644
index 8d4814699e086a8363c003fcf475bdba53734602..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_transfo_xl.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers import is_torch_available
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import require_torch
-
-
-if is_torch_available():
-    from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
-
-
-@require_torch
-class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "<unk>",
-            "[CLS]",
-            "[SEP]",
-            "want",
-            "unwanted",
-            "wa",
-            "un",
-            "running",
-            ",",
-            "low",
-            "l",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs["lower_case"] = True
-        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "<unk> UNwanted , running"
-        output_text = "<unk> unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
-
-        tokens = tokenizer.tokenize("<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
-
-    def test_full_tokenizer_lower(self):
-        tokenizer = TransfoXLTokenizer(lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-
-    def test_full_tokenizer_no_lower(self):
-        tokenizer = TransfoXLTokenizer(lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
diff --git a/server/transformers/tests/test_tokenization_utils.py b/server/transformers/tests/test_tokenization_utils.py
deleted file mode 100644
index 2909b4f9daa4bf2f80e01ef6966585f46beace23..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_utils.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-from transformers import PreTrainedTokenizer
-from transformers.tokenization_gpt2 import GPT2Tokenizer
-
-from .utils import slow
-
-
-class TokenizerUtilsTest(unittest.TestCase):
-    def check_tokenizer_from_pretrained(self, tokenizer_class):
-        s3_models = list(tokenizer_class.max_model_input_sizes.keys())
-        for model_name in s3_models[:1]:
-            tokenizer = tokenizer_class.from_pretrained(model_name)
-            self.assertIsNotNone(tokenizer)
-            self.assertIsInstance(tokenizer, tokenizer_class)
-            self.assertIsInstance(tokenizer, PreTrainedTokenizer)
-
-            for special_tok in tokenizer.all_special_tokens:
-                self.assertIsInstance(special_tok, str)
-                special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
-                self.assertIsInstance(special_tok_id, int)
-
-    @slow
-    def test_pretrained_tokenizers(self):
-        self.check_tokenizer_from_pretrained(GPT2Tokenizer)
diff --git a/server/transformers/tests/test_tokenization_xlm.py b/server/transformers/tests/test_tokenization_xlm.py
deleted file mode 100644
index 5fd7379388b54abc299d2527809b71a0bb2f7d47..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_xlm.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import unittest
-
-from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import slow
-
-
-class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = XLMTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "w</w>",
-            "r</w>",
-            "t</w>",
-            "lo",
-            "low",
-            "er</w>",
-            "low</w>",
-            "lowest</w>",
-            "newer</w>",
-            "wider</w>",
-            "<unk>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
-            fp.write("\n".join(merges))
-
-    def get_tokenizer(self, **kwargs):
-        return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
-        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
-
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [1] + text + [1]
-        assert encoded_pair == [1] + text + [1] + text_2 + [1]
diff --git a/server/transformers/tests/test_tokenization_xlnet.py b/server/transformers/tests/test_tokenization_xlnet.py
deleted file mode 100644
index 2fa94bfbc928dbad0ae1c2f6c6ed2f5dc6ab1326..0000000000000000000000000000000000000000
--- a/server/transformers/tests/test_tokenization_xlnet.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
-
-from .test_tokenization_common import TokenizerTesterMixin
-from .utils import slow
-
-
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-
-
-class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-    tokenizer_class = XLNetTokenizer
-
-    def setUp(self):
-        super().setUp()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
-
-    def get_tokenizer(self, **kwargs):
-        return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self):
-        input_text = "This is a test"
-        output_text = "This is a test"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "<unk>",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "<unk>",
-                ".",
-            ],
-        )
-
-    def test_tokenizer_lower(self):
-        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "",
-                "i",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "se",
-                ".",
-            ],
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"])
-
-    def test_tokenizer_no_lower(self):
-        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "se",
-                ".",
-            ],
-        )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == text + [4, 3]
-        assert encoded_pair == text + [4] + text_2 + [4, 3]
diff --git a/server/transformers/tests/utils.py b/server/transformers/tests/utils.py
deleted file mode 100644
index 163628d3a7d682d59f66e8fe038e360daa602308..0000000000000000000000000000000000000000
--- a/server/transformers/tests/utils.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import tempfile
-import unittest
-from distutils.util import strtobool
-
-from transformers.file_utils import _tf_available, _torch_available
-
-
-CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test")
-
-SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
-DUMMY_UNKWOWN_IDENTIFIER = "julien-c/dummy-unknown"
-# Used to test Auto{Config, Model, Tokenizer} model_type detection.
-
-
-def parse_flag_from_env(key, default=False):
-    try:
-        value = os.environ[key]
-    except KeyError:
-        # KEY isn't set, default to `default`.
-        _value = default
-    else:
-        # KEY is set, convert it to True or False.
-        try:
-            _value = strtobool(value)
-        except ValueError:
-            # More values are supported, but let's keep the message simple.
-            raise ValueError("If set, {} must be yes or no.".format(key))
-    return _value
-
-
-_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
-_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
-
-
-def slow(test_case):
-    """
-    Decorator marking a test as slow.
-
-    Slow tests are skipped by default. Set the RUN_SLOW environment variable
-    to a truthy value to run them.
-
-    """
-    if not _run_slow_tests:
-        test_case = unittest.skip("test is slow")(test_case)
-    return test_case
-
-
-def custom_tokenizers(test_case):
-    """
-    Decorator marking a test for a custom tokenizer.
-
-    Custom tokenizers require additional dependencies, and are skipped
-    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
-    to a truthy value to run them.
-    """
-    if not _run_custom_tokenizers:
-        test_case = unittest.skip("test of custom tokenizers")(test_case)
-    return test_case
-
-
-def require_torch(test_case):
-    """
-    Decorator marking a test that requires PyTorch.
-
-    These tests are skipped when PyTorch isn't installed.
-
-    """
-    if not _torch_available:
-        test_case = unittest.skip("test requires PyTorch")(test_case)
-    return test_case
-
-
-def require_tf(test_case):
-    """
-    Decorator marking a test that requires TensorFlow.
-
-    These tests are skipped when TensorFlow isn't installed.
-
-    """
-    if not _tf_available:
-        test_case = unittest.skip("test requires TensorFlow")(test_case)
-    return test_case
-
-
-if _torch_available:
-    # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
-else:
-    torch_device = None
diff --git a/server/transformers/transformers-cli b/server/transformers/transformers-cli
deleted file mode 100755
index 9813b838433252821ec44e726275326e55bbc3c8..0000000000000000000000000000000000000000
--- a/server/transformers/transformers-cli
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-from argparse import ArgumentParser
-
-from transformers.commands.convert import ConvertCommand
-from transformers.commands.download import DownloadCommand
-from transformers.commands.env import EnvironmentCommand
-from transformers.commands.run import RunCommand
-from transformers.commands.serving import ServeCommand
-from transformers.commands.user import UserCommands
-
-if __name__ == '__main__':
-    parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
-    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
-
-    # Register commands
-    ConvertCommand.register_subcommand(commands_parser)
-    DownloadCommand.register_subcommand(commands_parser)
-    EnvironmentCommand.register_subcommand(commands_parser)
-    RunCommand.register_subcommand(commands_parser)
-    ServeCommand.register_subcommand(commands_parser)
-    UserCommands.register_subcommand(commands_parser)
-
-    # Let's go
-    args = parser.parse_args()
-
-    if not hasattr(args, 'func'):
-        parser.print_help()
-        exit(1)
-
-    # Run
-    service = args.func(args)
-    service.run()
diff --git a/server/transformers/utils/download_glue_data.py b/server/transformers/utils/download_glue_data.py
deleted file mode 100644
index b46cbcd7b22f00547e93f98be035f98aaf59e18a..0000000000000000000000000000000000000000
--- a/server/transformers/utils/download_glue_data.py
+++ /dev/null
@@ -1,154 +0,0 @@
-""" Script for downloading all GLUE data.
-Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
-
-Note: for legal reasons, we are unable to host MRPC.
-You can either use the version hosted by the SentEval team, which is already tokenized,
-or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
-For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
-You should then rename and place specific files in a folder (see below for an example).
-
-mkdir MRPC
-cabextract MSRParaphraseCorpus.msi -d MRPC
-cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
-cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
-rm MRPC/_*
-rm MSRParaphraseCorpus.msi
-
-1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
-2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
-"""
-
-import argparse
-import os
-import sys
-import urllib.request
-import zipfile
-
-
-TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
-TASK2PATH = {
-    "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4",
-    "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8",
-    "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc",
-    "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5",
-    "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5",
-    "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce",
-    "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df",
-    "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601",
-    "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb",
-    "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf",
-    "diagnostic": "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D",
-}
-
-MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt"
-MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"
-
-
-def download_and_extract(task, data_dir):
-    print("Downloading and extracting %s..." % task)
-    data_file = "%s.zip" % task
-    urllib.request.urlretrieve(TASK2PATH[task], data_file)
-    with zipfile.ZipFile(data_file) as zip_ref:
-        zip_ref.extractall(data_dir)
-    os.remove(data_file)
-    print("\tCompleted!")
-
-
-def format_mrpc(data_dir, path_to_data):
-    print("Processing MRPC...")
-    mrpc_dir = os.path.join(data_dir, "MRPC")
-    if not os.path.isdir(mrpc_dir):
-        os.mkdir(mrpc_dir)
-    if path_to_data:
-        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
-        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
-    else:
-        print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
-        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
-        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-        urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
-        urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
-    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
-    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
-    urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
-
-    dev_ids = []
-    with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
-        for row in ids_fh:
-            dev_ids.append(row.strip().split("\t"))
-
-    with open(mrpc_train_file, encoding="utf8") as data_fh, open(
-        os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8"
-    ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh:
-        header = data_fh.readline()
-        train_fh.write(header)
-        dev_fh.write(header)
-        for row in data_fh:
-            label, id1, id2, s1, s2 = row.strip().split("\t")
-            if [id1, id2] in dev_ids:
-                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-            else:
-                train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-
-    with open(mrpc_test_file, encoding="utf8") as data_fh, open(
-        os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8"
-    ) as test_fh:
-        header = data_fh.readline()
-        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
-        for idx, row in enumerate(data_fh):
-            label, id1, id2, s1, s2 = row.strip().split("\t")
-            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
-    print("\tCompleted!")
-
-
-def download_diagnostic(data_dir):
-    print("Downloading and extracting diagnostic...")
-    if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
-        os.mkdir(os.path.join(data_dir, "diagnostic"))
-    data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
-    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
-    print("\tCompleted!")
-    return
-
-
-def get_tasks(task_names):
-    task_names = task_names.split(",")
-    if "all" in task_names:
-        tasks = TASKS
-    else:
-        tasks = []
-        for task_name in task_names:
-            assert task_name in TASKS, "Task %s not found!" % task_name
-            tasks.append(task_name)
-    return tasks
-
-
-def main(arguments):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--data_dir", help="directory to save data to", type=str, default="glue_data")
-    parser.add_argument(
-        "--tasks", help="tasks to download data for as a comma separated string", type=str, default="all"
-    )
-    parser.add_argument(
-        "--path_to_mrpc",
-        help="path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt",
-        type=str,
-        default="",
-    )
-    args = parser.parse_args(arguments)
-
-    if not os.path.isdir(args.data_dir):
-        os.mkdir(args.data_dir)
-    tasks = get_tasks(args.tasks)
-
-    for task in tasks:
-        if task == "MRPC":
-            format_mrpc(args.data_dir, args.path_to_mrpc)
-        elif task == "diagnostic":
-            download_diagnostic(args.data_dir)
-        else:
-            download_and_extract(task, args.data_dir)
-
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/server/transformers/utils/link_tester.py b/server/transformers/utils/link_tester.py
deleted file mode 100644
index 0ef165c401b84f8b15ac9a7eea1e699a888b77fd..0000000000000000000000000000000000000000
--- a/server/transformers/utils/link_tester.py
+++ /dev/null
@@ -1,79 +0,0 @@
-""" Link tester.
-
-This little utility reads all the python files in the repository,
-scans for links pointing to S3 and tests the links one by one. Raises an error
-at the end of the scan if at least one link was reported broken.
-"""
-import os
-import re
-import sys
-
-import requests
-
-
-REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
-
-
-def list_python_files_in_repository():
-    """ List all python files in the repository.
-
-    This function assumes that the script is executed in the root folder.
-    """
-    source_code_files = []
-    for path, subdirs, files in os.walk("."):
-        if "templates" in path:
-            continue
-        for name in files:
-            if ".py" in name and ".pyc" not in name:
-                path_to_files = os.path.join(path, name)
-                source_code_files.append(path_to_files)
-
-    return source_code_files
-
-
-def find_all_links(file_paths):
-    links = []
-    for path in file_paths:
-        links += scan_code_for_links(path)
-
-    return links
-
-
-def scan_code_for_links(source):
-    """ Scans the file to find links using a regular expression.
-    Returns a list of links.
-    """
-    with open(source, "r") as content:
-        content = content.read()
-        raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
-        links = [prefix + suffix for _, prefix, suffix in raw_links]
-
-    return links
-
-
-def check_all_links(links):
-    """ Check that the provided links are valid.
-
-    Links are considered valid if a HEAD request to the server
-    returns a 200 status code.
-    """
-    broken_links = []
-    for link in links:
-        head = requests.head(link)
-        if head.status_code != 200:
-            broken_links.append(link)
-
-    return broken_links
-
-
-if __name__ == "__main__":
-    file_paths = list_python_files_in_repository()
-    links = find_all_links(file_paths)
-    broken_links = check_all_links(links)
-    print("Looking for broken links to pre-trained models/configs/tokenizers...")
-    if broken_links:
-        print("The following links did not respond:")
-        for link in broken_links:
-            print("- {}".format(link))
-        sys.exit(1)
-    print("All links are ok.")
diff --git a/server/transformers/valohai.yaml b/server/transformers/valohai.yaml
deleted file mode 100644
index 2573551b4e23d6f2243f4584f2c20007fed155f2..0000000000000000000000000000000000000000
--- a/server/transformers/valohai.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
----
-
-- step:
-    name: Execute python examples/run_glue.py
-    image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
-    command:
-      - python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data
-      - pip install -e .
-      - pip install -r examples/requirements.txt
-      - python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters}
-    parameters:
-      - name: model_type
-        pass-as: --model_type={v}
-        type: string
-        default: bert
-      - name: model_name_or_path
-        pass-as: --model_name_or_path={v}
-        type: string
-        default: bert-base-uncased
-      - name: task_name
-        pass-as: --task_name={v}
-        type: string
-        default: MRPC
-      - name: max_seq_length
-        pass-as: --max_seq_length={v}
-        description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.
-        type: integer
-        default: 128
-      - name: per_gpu_train_batch_size
-        pass-as: --per_gpu_train_batch_size={v}
-        description: Batch size per GPU/CPU for training.
-        type: integer
-        default: 8
-      - name: per_gpu_eval_batch_size
-        pass-as: --per_gpu_eval_batch_size={v}
-        description: Batch size per GPU/CPU for evaluation.
-        type: integer
-        default: 8
-      - name: gradient_accumulation_steps
-        pass-as: --gradient_accumulation_steps={v}
-        description: Number of updates steps to accumulate before performing a backward/update pass.
-        type: integer
-        default: 1
-      - name: learning_rate
-        pass-as: --learning_rate={v}
-        description: The initial learning rate for Adam.
-        type: float
-        default: 0.00005
-      - name: adam_epsilon
-        pass-as: --adam_epsilon={v}
-        description: Epsilon for Adam optimizer.
-        type: float
-        default: 0.00000001
-      - name: max_grad_norm
-        pass-as: --max_grad_norm={v}
-        description: Max gradient norm.
-        type: float
-        default: 1.0
-      - name: num_train_epochs
-        pass-as: --num_train_epochs={v}
-        description: Total number of training epochs to perform.
-        type: integer
-        default: 3
-      - name: max_steps
-        pass-as: --max_steps={v}
-        description: If > 0, set total number of training steps to perform. Override num_train_epochs.
-        type: integer
-        default: -1
-      - name: warmup_steps
-        pass-as: --warmup_steps={v}
-        description: Linear warmup over warmup_steps.
-        type: integer
-        default: -1
-      - name: logging_steps
-        pass-as: --logging_steps={v}
-        description: Log every X updates steps.
-        type: integer
-        default: 25
-      - name: save_steps
-        pass-as: --save_steps={v}
-        description: Save checkpoint every X updates steps.
-        type: integer
-        default: -1
-      - name: output_dir
-        pass-as: --output_dir={v}
-        type: string
-        default: /valohai/outputs
-      - name: evaluate_during_training
-        description: Run evaluation during training at each logging step.
-        type: flag
-        default: true
-      - name: do_lower_case
-        description: Set this flag if you are using an uncased model.
-        type: flag