Commit · 5c0c2df0
Parent(s):
Duplicate from yhshin/latex-ocr
Co-authored-by: Young Ho Shin <[email protected]>
- .gitattributes +27 -0
- README.md +14 -0
- app.py +74 -0
- article.md +74 -0
- examples/1d32874f02.png +0 -0
- examples/1e466b180d.png +0 -0
- examples/2d3503f427.png +0 -0
- examples/2f9d3c4e43.png +0 -0
- examples/51c5cc2ff5.png +0 -0
- examples/545a492388.png +0 -0
- examples/6a51a30502.png +0 -0
- examples/6bf6832adb.png +0 -0
- examples/7afdeff0e6.png +0 -0
- examples/b8f1e64b1f.png +0 -0
- model/config.json +168 -0
- model/pytorch_model.bin +3 -0
- model/tokenizer-wordlevel.json +352 -0
- requirements.txt +4 -0
- tokenizer-wordlevel.json +352 -0
- version-history.md +6 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Latex Ocr
+emoji: 👀
+colorFrom: red
+colorTo: indigo
+sdk: gradio
+sdk_version: 2.9.4
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: yhshin/latex-ocr
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py
ADDED
@@ -0,0 +1,74 @@
+import gradio as gr
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+import requests
+from PIL import Image
+
+url = 'https://huggingface.co/yhshin/latex-ocr/raw/main/tokenizer-wordlevel.json'
+r = requests.get(url)
+open('tokenizer-wordlevel.json', 'wb').write(r.content)
+
+
+processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-printed")
+model = VisionEncoderDecoderModel.from_pretrained("yhshin/latex-ocr")
+
+from tokenizers import Tokenizer
+tokenizer = Tokenizer.from_file("tokenizer-wordlevel.json")
+
+# load image examples
+
+def process_image(image):
+    # prepare image
+    pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    # generate (no beam search)
+    generated_ids = model.generate(pixel_values)
+
+    # decode
+    generated_text = tokenizer.decode_batch(generated_ids.tolist(), skip_special_tokens=True)[0]
+
+    # Strip spaces
+    generated_text = generated_text.replace(" ", "")
+
+    return generated_text
+
+# !ls examples | grep png
+
+# +
+title = "Convert image to LaTeX source code"
+
+with open('article.md', mode='r') as file:
+    article = file.read()
+
+description = """
+This is a demo of a machine learning model trained to reconstruct the LaTeX source code of an equation from an image.
+To use it, simply upload an image or use one of the example images below and click 'Submit'.
+Results will show up in a few seconds.
+
+Try rendering the generated LaTeX [here](https://quicklatex.com/) to compare with the original.
+(The model is not perfect yet, so you may need to edit the resulting LaTeX a bit to get it to render a good match.)
+
+"""
+
+examples = [
+    ["examples/1d32874f02.png"],
+    ["examples/1e466b180d.png"],
+    ["examples/2d3503f427.png"],
+    ["examples/2f9d3c4e43.png"],
+    ["examples/51c5cc2ff5.png"],
+    ["examples/545a492388.png"],
+    ["examples/6a51a30502.png"],
+    ["examples/6bf6832adb.png"],
+    ["examples/7afdeff0e6.png"],
+    ["examples/b8f1e64b1f.png"],
+]
+# -
+
+iface = gr.Interface(fn=process_image,
+                     inputs=[gr.inputs.Image(type="pil")],
+                     outputs=gr.outputs.Textbox(),
+                     title=title,
+                     description=description,
+                     article=article,
+                     examples=examples)
+iface.launch()
+
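
For reference, here is a minimal sketch of running the same pipeline outside the Gradio UI. It assumes `processor`, `model`, and `tokenizer` have been loaded exactly as in app.py above; the image path is simply one of the examples added in this commit.

```python
# Standalone inference sketch, assuming processor, model and tokenizer are
# loaded as in app.py above.
from PIL import Image

image = Image.open("examples/1d32874f02.png").convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
# decode_batch returns space-separated tokens; app.py strips the spaces afterwards
latex = tokenizer.decode_batch(generated_ids.tolist(), skip_special_tokens=True)[0].replace(" ", "")
print(latex)
```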
article.md
ADDED
@@ -0,0 +1,74 @@
+## What's the point of this?
+
+LaTeX is the de facto standard markup language for typesetting pretty equations in academic papers.
+It is extremely feature-rich and flexible, but very verbose.
+This makes it great for typesetting complex equations, but not very convenient for quick note-taking on the fly.
+
+For example, here's a short equation from [this page](https://en.wikipedia.org/wiki/Quantum_electrodynamics) on Wikipedia about Quantum Electrodynamics
+and the corresponding LaTeX code:
+
+
+
+```
+{\displaystyle {\mathcal {L}}={\bar {\psi }}(i\gamma ^{\mu }D_{\mu }-m)\psi -{\frac {1}{4}}F_{\mu \nu }F^{\mu \nu },}
+```
+
+
+This demo is a first step in solving this problem.
+Eventually, you'll be able to take a quick partial screenshot from a paper,
+and a program built with this model will generate the corresponding LaTeX source code
+so that you can copy/paste it straight into your personal notes.
+No more endless googling of obscure LaTeX syntax!
+
+## How does it work?
+
+Because this problem involves looking at an image and generating valid LaTeX code,
+the model needs to handle both Computer Vision (CV) and Natural Language Processing (NLP).
+There are some other projects that aim to solve the same problem with some very interesting models.
+These generally involve some kind of "encoder" that looks at the image and extracts/encodes the information about the equation from the image,
+and a "decoder" that takes that information and translates it into what is hopefully both valid and accurate LaTeX code.
+The "encode" part can be done using classic CNN architectures commonly used for CV tasks, or newer vision transformer architectures.
+The "decode" part can be done with LSTMs or transformer decoders, using an attention mechanism to make sure the decoder understands long-range dependencies, e.g. remembering to close a bracket that was opened a long sequence away.
+
+I chose to tackle this problem with transfer learning, using an existing OCR model and fine-tuning it for this task.
+The biggest reason for this is computing constraints:
+GPU hours are expensive, so I wanted training to be reasonably fast, on the order of a couple of hours.
+There are some other benefits to this approach,
+e.g. the architecture is already proven to be robust.
+I chose [TrOCR](https://arxiv.org/abs/2109.10282), a model trained at Microsoft for text recognition tasks, which uses a transformer architecture for both the encoder and decoder.
+
+For the data, I used the `im2latex-100k` dataset, which includes a total of roughly 100k formulas and images.
+Some preprocessing steps were done by Harvard NLP for the [`im2markup` project](https://github.com/harvardnlp/im2markup).
+To limit the scope of the project and simplify the task, I limited the training data to equations containing 100 LaTeX tokens or fewer.
+This covers most single-line equations, including fractions, subscripts, symbols, etc., but does not cover large multi-line equations, some of which can have up to 500 LaTeX tokens.
+GPU training was done on a Kaggle GPU Kernel in roughly 3 hours.
+You can find the full training code on my Kaggle profile [here](https://www.kaggle.com/code/younghoshin/finetuning-trocr/notebook).
+
+## What's next?
+
+There are multiple improvements that I'm hoping to make to this project.
+
+### More robust prediction
+
+If you've tried the examples above (randomly sampled from the test set), you've probably noticed that the model predictions aren't quite perfect: the model occasionally misses, duplicates, or mistakes tokens.
+More training on the existing dataset could help with this.
+
+### More data
+
+There's a lot of LaTeX data available on the internet besides `im2latex-100k`, e.g. arXiv and Wikipedia.
+It's just waiting to be scraped and used for this project.
+This means a lot of hours of scraping, cleaning, and processing, but having a more diverse set of input images could improve model accuracy significantly.
+
+### Faster and smaller model
+
+The model currently takes a few seconds to process a single image.
+I would love to improve performance so that it can run in one second or less, maybe even on mobile devices.
+This might be impossible with TrOCR, which is a fairly large model designed for use on GPUs.
+
+
+<p style='text-align: center'>Made by Young Ho Shin</p>
+<p style='text-align: center'>
+<a href = "mailto: [email protected]">Email</a> |
+<a href='https://www.github.com/yhshin11'>Github</a> |
+<a href='https://www.linkedin.com/in/young-ho-shin/'>Linkedin</a>
+</p>
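
The "How does it work?" section above describes fine-tuning TrOCR's vision encoder / text decoder pair on `im2latex-100k` with a small word-level LaTeX tokenizer. Below is a rough, hedged sketch of what that setup might look like with `transformers` and `tokenizers`; the actual training code lives in the linked Kaggle notebook, the `encode_example` helper is hypothetical, and the token ids and generation values simply mirror what is visible in `model/config.json` and `tokenizer-wordlevel.json` in this commit.

```python
# Sketch only: the real training loop is in the linked Kaggle notebook.
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from tokenizers import Tokenizer

# Image preprocessing comes from the printed TrOCR processor (as in app.py);
# the weights start from the stage1 checkpoint named in model/config.json.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-stage1")
latex_tokenizer = Tokenizer.from_file("tokenizer-wordlevel.json")  # 200-token word-level vocab

# Special-token ids and generation defaults matching model/config.json in this commit.
model.config.decoder_start_token_id = 1  # [CLS]
model.config.eos_token_id = 2            # [SEP]
model.config.pad_token_id = 3            # [PAD]
model.config.max_length = 100            # equations capped at 100 LaTeX tokens
model.config.num_beams = 4

def encode_example(image, latex):
    """Turn one (image, LaTeX string) pair into model inputs.

    The serialized tokenizer already truncates/pads label sequences to 100 tokens;
    in practice pad positions in the labels are usually replaced with -100 so the
    loss ignores them.
    """
    pixel_values = processor(image, return_tensors="pt").pixel_values.squeeze(0)
    labels = torch.tensor(latex_tokenizer.encode(latex).ids)
    return {"pixel_values": pixel_values, "labels": labels}
```

From there, training is the usual seq2seq loop: pass `pixel_values` and `labels` to the model, take the returned loss, and backpropagate.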
examples/1d32874f02.png
ADDED
|
examples/1e466b180d.png
ADDED
|
examples/2d3503f427.png
ADDED
|
examples/2f9d3c4e43.png
ADDED
|
examples/51c5cc2ff5.png
ADDED
|
examples/545a492388.png
ADDED
|
examples/6a51a30502.png
ADDED
|
examples/6bf6832adb.png
ADDED
|
examples/7afdeff0e6.png
ADDED
|
examples/b8f1e64b1f.png
ADDED
|
model/config.json
ADDED
@@ -0,0 +1,168 @@
+{
+  "_name_or_path": "microsoft/trocr-small-stage1",
+  "architectures": [
+    "VisionEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "relu",
+    "add_cross_attention": true,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": 384,
+    "d_model": 256,
+    "decoder_attention_heads": 8,
+    "decoder_ffn_dim": 1024,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 6,
+    "decoder_start_token_id": 2,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layernorm_embedding": true,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 512,
+    "min_length": 0,
+    "model_type": "trocr",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": true,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.2",
+    "use_bfloat16": false,
+    "use_cache": false,
+    "use_learned_position_embeddings": true,
+    "vocab_size": 64044
+  },
+  "decoder_start_token_id": 1,
+  "early_stopping": true,
+  "encoder": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 384,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 384,
+    "initializer_range": 0.02,
+    "intermediate_size": 1536,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "deit",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 6,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 16,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qkv_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.2",
+    "use_bfloat16": false
+  },
+  "eos_token_id": 2,
+  "is_encoder_decoder": true,
+  "length_penalty": 2.0,
+  "max_length": 100,
+  "model_type": "vision-encoder-decoder",
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 3,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vocab_size": 200
+}
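
The generation-related fields at the bottom of this config (`max_length`, `num_beams`, `length_penalty`, `no_repeat_ngram_size`, `early_stopping`) are what `model.generate` falls back to when app.py calls it with no arguments. Assuming the `model` and `pixel_values` from app.py above, an equivalent call with those values spelled out would look roughly like this:

```python
# Roughly equivalent to app.py's bare model.generate(pixel_values), with the
# defaults stored in model/config.json made explicit.
generated_ids = model.generate(
    pixel_values,
    max_length=100,
    num_beams=4,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    early_stopping=True,
)
```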
model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4a4c0a9d1da23b3f66f4e6af213f9d17945aa1fb9376f2ad59c03ec74995ba7
+size 246530221
model/tokenizer-wordlevel.json
ADDED
@@ -0,0 +1,352 @@
+{
+  "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 100,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 100
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "[CLS]": {
+        "id": "[CLS]",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "[CLS]"
+        ]
+      },
+      "[SEP]": {
+        "id": "[SEP]",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "[SEP]"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "[UNK]": 0,
+      "[CLS]": 1,
+      "[SEP]": 2,
+      "[PAD]": 3,
+      "[MASK]": 4,
+      "}": 5,
+      "{": 6,
+      "\\": 7,
+      "_": 8,
+      "^": 9,
+      "(": 10,
+      ")": 11,
+      "2": 12,
+      "1": 13,
+      "-": 14,
+      "=": 15,
+      ",": 16,
+      "+": 17,
+      "frac": 18,
+      "i": 19,
+      "0": 20,
+      "x": 21,
+      "n": 22,
+      ".": 23,
+      "d": 24,
+      "\\,": 25,
+      "a": 26,
+      "mu": 27,
+      "left": 28,
+      "right": 29,
+      "e": 30,
+      "k": 31,
+      "c": 32,
+      "m": 33,
+      "r": 34,
+      "p": 35,
+      "3": 36,
+      "alpha": 37,
+      "t": 38,
+      "partial": 39,
+      "~": 40,
+      "l": 41,
+      "A": 42,
+      "s": 43,
+      "&": 44,
+      "4": 45,
+      "j": 46,
+      "\\;": 47,
+      "g": 48,
+      "prime": 49,
+      "]": 50,
+      "[": 51,
+      "nu": 52,
+      "z": 53,
+      "pi": 54,
+      "|": 55,
+      "b": 56,
+      "phi": 57,
+      "\\\\": 58,
+      "mathrm": 59,
+      "q": 60,
+      "operatorname": 61,
+      "cal": 62,
+      "N": 63,
+      "delta": 64,
+      "f": 65,
+      "lambda": 66,
+      "beta": 67,
+      "bar": 68,
+      "T": 69,
+      "int": 70,
+      "array": 71,
+      "R": 72,
+      "S": 73,
+      "D": 74,
+      "L": 75,
+      "M": 76,
+      "B": 77,
+      "y": 78,
+      "sigma": 79,
+      "F": 80,
+      "theta": 81,
+      "/": 82,
+      "gamma": 83,
+      "h": 84,
+      "hat": 85,
+      "psi": 86,
+      "sqrt": 87,
+      "sum": 88,
+      "u": 89,
+      "H": 90,
+      "o": 91,
+      "rho": 92,
+      "tilde": 93,
+      "tau": 94,
+      "C": 95,
+      "P": 96,
+      "G": 97,
+      "V": 98,
+      "I": 99,
+      "X": 100,
+      "omega": 101,
+      "epsilon": 102,
+      "E": 103,
+      "J": 104,
+      "bf": 105,
+      "eta": 106,
+      "v": 107,
+      "xi": 108,
+      "Q": 109,
+      "Phi": 110,
+      "quad": 111,
+      "*": 112,
+      "5": 113,
+      "\\{": 114,
+      "vec": 115,
+      "begin": 116,
+      "end": 117,
+      "Gamma": 118,
+      "K": 119,
+      "infty": 120,
+      "\\}": 121,
+      "6": 122,
+      "U": 123,
+      "rangle": 124,
+      "dot": 125,
+      "W": 126,
+      "pm": 127,
+      "Lambda": 128,
+      "Z": 129,
+      "varphi": 130,
+      "Delta": 131,
+      "w": 132,
+      "chi": 133,
+      ";": 134,
+      "8": 135,
+      "\\!": 136,
+      "Omega": 137,
+      "kappa": 138,
+      "qquad": 139,
+      "cdot": 140,
+      "Psi": 141,
+      "equiv": 142,
+      "langle": 143,
+      "overline": 144,
+      ">": 145,
+      "<": 146,
+      "dagger": 147,
+      "zeta": 148,
+      "varepsilon": 149,
+      "cdots": 150,
+      "rightarrow": 151,
+      "O": 152,
+      "nabla": 153,
+      "Y": 154,
+      "ldots": 155,
+      ":": 156,
+      "Sigma": 157,
+      "ell": 158,
+      "7": 159,
+      "mathcal": 160,
+      "\\:": 161,
+      "!": 162,
+      "otimes": 163,
+      "prod": 164,
+      "wedge": 165,
+      "9": 166,
+      "hspace": 167,
+      "Pi": 168,
+      "hbar": 169,
+      "sim": 170,
+      "vert": 171,
+      "in": 172,
+      "Big": 173,
+      "widetilde": 174,
+      "displaystyle": 175,
+      "times": 176,
+      "Theta": 177,
+      "underline": 178,
+      "mid": 179,
+      "to": 180,
+      "dots": 181,
+      "mathbf": 182,
+      "ast": 183,
+      "leq": 184,
+      "approx": 185,
+      "star": 186,
+      "stackrel": 187,
+      "perp": 188,
+      "widehat": 189,
+      "big": 190,
+      "vartheta": 191,
+      "'": 192,
+      "Bigr": 193,
+      "geq": 194,
+      "mp": 195,
+      "Bigl": 196,
+      "dag": 197,
+      "neq": 198,
+      "simeq": 199
+    },
+    "unk_token": "[UNK]"
+  }
+}
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+torch==1.9.0
+transformers==4.18.0
+sentencepiece==0.1.96
+
tokenizer-wordlevel.json
ADDED
@@ -0,0 +1,352 @@
(352 added lines, identical to model/tokenizer-wordlevel.json above)
version-history.md
ADDED
@@ -0,0 +1,6 @@
+| Version | # epochs | max # tokens | vocab size | notebook and training log | Comments |
+|---------|----------|--------------|------------|----------------------------|----------|
+| v4 | 10 | 100 | 200 | [link](https://www.kaggle.com/code/younghoshin/finetuning-trocr/notebook?scriptVersionId=94172330) | |
+| | | | | | |
+| | | | | | |
+| | | | | | |