kjcjohnson commited on
Commit
901bbd9
·
1 Parent(s): 9fbf7f9

Add GAD libraries

Browse files
Files changed (30) hide show
  1. transformers_gad/__init__.py +5 -0
  2. transformers_gad/__pycache__/__init__.cpython-311.pyc +0 -0
  3. transformers_gad/__pycache__/grammar_utils.cpython-311.pyc +0 -0
  4. transformers_gad/__pycache__/logging_config.cpython-311.pyc +0 -0
  5. transformers_gad/__pycache__/mapping.cpython-311.pyc +0 -0
  6. transformers_gad/__pycache__/parser.cpython-311.pyc +0 -0
  7. transformers_gad/__pycache__/recognizer.cpython-311.pyc +0 -0
  8. transformers_gad/__pycache__/token_grammar_recognizer.cpython-311.pyc +0 -0
  9. transformers_gad/__pycache__/trie.cpython-311.pyc +0 -0
  10. transformers_gad/__pycache__/utf8_utils.cpython-311.pyc +0 -0
  11. transformers_gad/__pycache__/utils.cpython-311.pyc +0 -0
  12. transformers_gad/__pycache__/vocab_struct.cpython-311.pyc +0 -0
  13. transformers_gad/generation/__init__.py +1 -0
  14. transformers_gad/generation/__pycache__/__init__.cpython-311.pyc +0 -0
  15. transformers_gad/generation/__pycache__/logits_process.cpython-311.pyc +0 -0
  16. transformers_gad/generation/logits_process.py +348 -0
  17. transformers_gad/grammar_utils.py +4 -0
  18. transformers_gad/logging_config.py +18 -0
  19. transformers_gad/mapping.py +209 -0
  20. transformers_gad/oracle/__init_.py +1 -0
  21. transformers_gad/oracle/__pycache__/oracle_trie.cpython-311.pyc +0 -0
  22. transformers_gad/oracle/oracle_trie.py +261 -0
  23. transformers_gad/parser.py +576 -0
  24. transformers_gad/parser_cfg.py +530 -0
  25. transformers_gad/recognizer.py +456 -0
  26. transformers_gad/token_grammar_recognizer.py +322 -0
  27. transformers_gad/trie.py +194 -0
  28. transformers_gad/utf8_utils.py +170 -0
  29. transformers_gad/utils.py +98 -0
  30. transformers_gad/vocab_struct.py +83 -0
transformers_gad/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .logging_config import setup_logging
2
+
3
+ setup_logging()
4
+
5
+ __version__ = "0.1.2"
transformers_gad/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (318 Bytes). View file
 
transformers_gad/__pycache__/grammar_utils.cpython-311.pyc ADDED
Binary file (330 Bytes). View file
 
transformers_gad/__pycache__/logging_config.cpython-311.pyc ADDED
Binary file (964 Bytes). View file
 
transformers_gad/__pycache__/mapping.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
transformers_gad/__pycache__/parser.cpython-311.pyc ADDED
Binary file (24.1 kB). View file
 
transformers_gad/__pycache__/recognizer.cpython-311.pyc ADDED
Binary file (20.2 kB). View file
 
transformers_gad/__pycache__/token_grammar_recognizer.cpython-311.pyc ADDED
Binary file (16.3 kB). View file
 
transformers_gad/__pycache__/trie.cpython-311.pyc ADDED
Binary file (8.37 kB). View file
 
transformers_gad/__pycache__/utf8_utils.cpython-311.pyc ADDED
Binary file (6.17 kB). View file
 
transformers_gad/__pycache__/utils.cpython-311.pyc ADDED
Binary file (4.66 kB). View file
 
transformers_gad/__pycache__/vocab_struct.cpython-311.pyc ADDED
Binary file (4.44 kB). View file
 
transformers_gad/generation/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .logits_process import GrammarConstrainedLogitsProcessor, GrammarAlignedOracleLogitsProcessor
transformers_gad/generation/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (343 Bytes). View file
 
transformers_gad/generation/__pycache__/logits_process.cpython-311.pyc ADDED
Binary file (17.2 kB). View file
 
transformers_gad/generation/logits_process.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import torch.nn.functional as F
4
+
5
+ import torch
6
+ import logging
7
+ from transformers.generation.logits_process import (
8
+ LogitsProcessor,
9
+ LOGITS_PROCESSOR_INPUTS_DOCSTRING,
10
+ )
11
+ from transformers.utils import add_start_docstrings
12
+ from transformers_gad.grammar_utils import IncrementalGrammarConstraint
13
+ from transformers_gad.oracle.oracle_trie import Trie
14
+
15
+ class GrammarConstrainedLogitsProcessor(LogitsProcessor):
16
+ def __init__(self, grammar_constraint, parse_start_index=None, save_log=False):
17
+ # Parser variables
18
+ self.grammar_constraint = grammar_constraint
19
+ self.batch_parsing_states = None
20
+ self.parse_start_index = parse_start_index
21
+
22
+ # To start with a longer prefix in enumerative search
23
+ self.generate_start_index = None
24
+ self.generated_tokens = None
25
+
26
+ # Generation Log
27
+ self.save_log = save_log
28
+ self.history = []
29
+
30
+ def reset(self):
31
+ self.reset_parser()
32
+ self.reset_history()
33
+
34
+ def reset_parser(self):
35
+ self.batch_parsing_states = None
36
+ if self.grammar_constraint.is_incremental:
37
+ self.grammar_constraint.reset()
38
+
39
+ self.generate_start_index = None
40
+ self.generated_tokens = None
41
+
42
+ def reset_history(self):
43
+ self.history = []
44
+
45
+ def mask_scores(self, scores, device):
46
+ """
47
+ resolve each stack to a tensor of True/False for each token
48
+ indicating acceptance
49
+ """
50
+ masked_scores = scores.clone()
51
+ acceptance = self.grammar_constraint.batch_filter_vocab(
52
+ self.batch_parsing_states, device
53
+ )
54
+
55
+ if self.save_log:
56
+ self.store_detailed_history(acceptance, scores)
57
+
58
+ # Scores to -inf where False
59
+ masked_scores[~acceptance] = -math.inf
60
+
61
+ return masked_scores
62
+
63
+ def process_scores(self, input_ids, scores):
64
+ # we dynamically create stacks at the first call, so that we know the batch size and beam size
65
+ if self.batch_parsing_states is None:
66
+ self.batch_parsing_states = [
67
+ copy.deepcopy(
68
+ self.grammar_constraint.string_recognizer.get_initial_accept_state()
69
+ )
70
+ for _ in range(len(input_ids))
71
+ ]
72
+
73
+ # assume the generation starts from the same index
74
+ if self.generate_start_index is None:
75
+ # the default is the end of input sequence of tokens
76
+ self.generate_start_index = self.parse_start_index \
77
+ if self.parse_start_index else input_ids.size(1)
78
+ self.generated_tokens = input_ids[:, self.generate_start_index:]
79
+
80
+ # Advance parser states
81
+ self.batch_parsing_states = self.grammar_constraint.advance_token_ids(
82
+ input_ids, self.batch_parsing_states, self.parse_start_index
83
+ )
84
+
85
+ masked_scores = self.mask_scores(scores, scores.device)
86
+ return masked_scores
87
+
88
+ @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
89
+ def __call__(
90
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
91
+ ) -> torch.FloatTensor:
92
+ return self.process_scores(input_ids, scores)
93
+
94
+ def reset_parser(self):
95
+ self.batch_parsing_states = None
96
+ if isinstance(self.grammar_constraint, IncrementalGrammarConstraint):
97
+ self.grammar_constraint.reset()
98
+
99
+ def get_accepted_tokens(self, acceptance):
100
+ """
101
+ Get the indices of accepted tokens and their corresponding string values for each item in the batch.
102
+
103
+ Parameters:
104
+ - acceptance (torch.Tensor): A boolean tensor indicating accepted tokens for each item in the batch.
105
+ """
106
+ batch_size, _ = acceptance.shape
107
+ acceptance_np = acceptance.cpu().numpy()
108
+ accepted_x, accepted_y = acceptance_np.nonzero()
109
+
110
+ # Initialize the dictionary with empty lists for indices
111
+ accepted_token_indices = {i: [] for i in range(batch_size)}
112
+ for x, y in zip(accepted_x, accepted_y):
113
+ accepted_token_indices[x].append(y)
114
+
115
+ # Convert token IDs to tokens
116
+ accepted_tokens = {
117
+ i: [self.grammar_constraint.tokenizer.decode([token_id]) for token_id in token_ids]
118
+ for i, token_ids in accepted_token_indices.items()
119
+ }
120
+
121
+ return accepted_tokens
122
+
123
+ def store_detailed_history(self, acceptance, scores):
124
+ """
125
+ Processes and stores information for accepted tokens including their IDs, tokens,
126
+ raw scores, and logits.
127
+
128
+ Parameters:
129
+ - acceptance (torch.Tensor): A boolean tensor indicating accepted tokens for each item in the batch.
130
+ - scores (torch.Tensor): The raw scores from the model output.
131
+ - adjusted_scores (torch.Tensor): The adjusted scores after applying expected future grammaticality.
132
+ """
133
+ likelihoods = F.softmax(scores, dim=-1)
134
+
135
+ # Initializing the list to store detailed information for each step
136
+ batch_accepted_info = []
137
+
138
+ for batch_index in range(acceptance.size(0)): # Iterate over batch items
139
+ accepted_info = []
140
+ accepted_indices = acceptance[batch_index].nonzero().squeeze(-1)
141
+
142
+ for idx in accepted_indices:
143
+ token_id = idx.item()
144
+ raw_score = scores[batch_index, idx].item()
145
+ likelihood = likelihoods[batch_index, idx].item()
146
+ token = self.grammar_constraint.tokenizer.decode([token_id])
147
+
148
+ # Store detailed information as a dictionary
149
+ accepted_info.append({
150
+ "token_id": token_id,
151
+ "token": str(token),
152
+ "raw_score": raw_score,
153
+ "raw_likelihood": likelihood
154
+ })
155
+
156
+ batch_accepted_info.append(accepted_info)
157
+
158
+ # Store this detailed information in the history
159
+ self.history.append(batch_accepted_info)
160
+
161
+ class GrammarAlignedOracleLogitsProcessor(LogitsProcessor):
162
+ def __init__(self, grammar_constraint, oracle_trie=Trie(), parse_start_index=None, save_log=False):
163
+ # Parser variables
164
+ self.grammar_constraint = grammar_constraint
165
+ self.batch_parsing_states = None
166
+ self.parse_start_index = parse_start_index
167
+
168
+ # ASAp oracle trie
169
+ self.oracle_trie = oracle_trie
170
+
171
+ # To start with a longer prefix in enumerative search
172
+ self.generate_start_index = None
173
+ self.generated_tokens = None
174
+
175
+ # Generation Log
176
+ self.save_log = save_log
177
+ self.history = []
178
+
179
+ def adjust_scores(self, scores, device):
180
+ """
181
+ resolve each stack to a tensor of True/False for each token
182
+ indicating acceptance
183
+ """
184
+ acceptance = self.grammar_constraint.batch_filter_vocab(
185
+ self.batch_parsing_states, device
186
+ )
187
+
188
+ current_parent = self.oracle_trie.search_last_parent(self.generated_tokens)
189
+ current_parent.insert_accepted_tokens(scores, acceptance)
190
+ adjusted_scores = self.apply_oracle_adjustments(acceptance, scores, current_parent)
191
+
192
+ if self.save_log:
193
+ self.store_detailed_history(acceptance, scores, adjusted_scores)
194
+
195
+ # Scores to -inf where False
196
+ adjusted_scores[~acceptance] = -math.inf
197
+
198
+ return adjusted_scores
199
+
200
+ def apply_oracle_adjustments(self, acceptance, scores, current_parent):
201
+ """
202
+ Multiply expected future grammarticality
203
+ Use the normalized (and unmasked) probabiltiy
204
+
205
+ Parameters:
206
+ - acceptance (torch.Tensor): A characteristic vector of valid tokens
207
+ used to updated only valid tokens
208
+ - scores (torch.Tensor): Unnormalized logits from language model
209
+ - current_parent (TrieNode): The trie node for the current prefix
210
+ """
211
+ adjusted_scores = scores.clone()
212
+ likelihoods = F.softmax(adjusted_scores, dim=-1)
213
+ log_likelihoods = torch.log(likelihoods)
214
+
215
+ for batch_index in range(acceptance.size(0)):
216
+ accepted_indices = acceptance[batch_index].nonzero().squeeze(-1)
217
+
218
+ for idx in accepted_indices:
219
+ token_id = idx.item()
220
+ log_likelihood = log_likelihoods[batch_index, idx].item()
221
+
222
+ # Get theta (log of expected future grammaticality) for this specific token
223
+ success_rate = current_parent.get_success_rate(token_id)
224
+
225
+ if not isinstance(success_rate, torch.Tensor):
226
+ success_rate = torch.tensor(success_rate, dtype=torch.float)
227
+ log_theta = torch.log(success_rate)
228
+
229
+ # Calculate adjusted score
230
+ adjusted_score = log_likelihood + log_theta
231
+ adjusted_scores[batch_index, idx] = adjusted_score
232
+
233
+ return adjusted_scores
234
+
235
+ def process_scores(self, input_ids, scores):
236
+ # we dynamically create stacks at the first call, so that we know the batch size and beam size
237
+ if self.batch_parsing_states is None:
238
+ self.batch_parsing_states = [
239
+ copy.deepcopy(
240
+ self.grammar_constraint.string_recognizer.get_initial_accept_state()
241
+ )
242
+ for _ in range(len(input_ids))
243
+ ]
244
+
245
+ # assume the generation starts from the same index
246
+ if self.generate_start_index is None:
247
+ # the default is the end of input sequence of tokens
248
+ self.generate_start_index = self.parse_start_index \
249
+ if self.parse_start_index else input_ids.size(1)
250
+ self.generated_tokens = input_ids[:, self.generate_start_index:]
251
+
252
+ # Advance parser states
253
+ self.batch_parsing_states = self.grammar_constraint.advance_token_ids(
254
+ input_ids, self.batch_parsing_states, self.parse_start_index
255
+ )
256
+
257
+ adjusted_scores = self.adjust_scores(scores, scores.device)
258
+
259
+ return adjusted_scores
260
+
261
+ @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
262
+ def __call__(
263
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
264
+ ) -> torch.FloatTensor:
265
+ return self.process_scores(input_ids, scores)
266
+
267
+ def reset(self):
268
+ self.reset_parser()
269
+ self.reset_history()
270
+
271
+ def reset_parser(self):
272
+ self.batch_parsing_states = None
273
+ if self.grammar_constraint.is_incremental:
274
+ self.grammar_constraint.reset()
275
+
276
+ self.generate_start_index = None
277
+ self.generated_tokens = None
278
+
279
+ def reset_history(self):
280
+ self.history = []
281
+
282
+ def reset_trie(self):
283
+ self.oracle_trie = Trie()
284
+
285
+ def get_accepted_tokens(self, acceptance):
286
+ """
287
+ Get the indices of accepted tokens and their corresponding string values for each item in the batch.
288
+
289
+ Parameters:
290
+ - acceptance (torch.Tensor): A boolean tensor indicating accepted tokens for each item in the batch.
291
+ """
292
+ batch_size, _ = acceptance.shape
293
+ acceptance_np = acceptance.cpu().numpy()
294
+ accepted_x, accepted_y = acceptance_np.nonzero()
295
+
296
+ # Initialize the dictionary with empty lists for indices
297
+ accepted_token_indices = {i: [] for i in range(batch_size)}
298
+ for x, y in zip(accepted_x, accepted_y):
299
+ accepted_token_indices[x].append(y)
300
+
301
+ # Convert token IDs to tokens
302
+ accepted_tokens = {
303
+ i: [self.grammar_constraint.tokenizer.decode([token_id]) for token_id in token_ids]
304
+ for i, token_ids in accepted_token_indices.items()
305
+ }
306
+
307
+ return accepted_tokens
308
+
309
+ def store_detailed_history(self, acceptance, scores, adjusted_scores):
310
+ """
311
+ Processes and stores information for accepted tokens including their IDs, tokens,
312
+ raw scores, and logits.
313
+
314
+ Parameters:
315
+ - acceptance (torch.Tensor): A boolean tensor indicating accepted tokens for each item in the batch.
316
+ - scores (torch.Tensor): The raw scores from the model output.
317
+ - adjusted_scores (torch.Tensor): The adjusted scores after applying expected future grammaticality.
318
+ """
319
+ likelihoods = F.softmax(scores, dim=-1)
320
+ adjusted_likelihoods = F.softmax(adjusted_scores, dim=-1)
321
+
322
+ # Initializing the list to store detailed information for each step
323
+ batch_accepted_info = []
324
+
325
+ for batch_index in range(acceptance.size(0)): # Iterate over batch items
326
+ accepted_info = []
327
+ accepted_indices = acceptance[batch_index].nonzero().squeeze(-1)
328
+
329
+ for idx in accepted_indices:
330
+ token_id = idx.item()
331
+ raw_score = scores[batch_index, idx].item()
332
+ likelihood = likelihoods[batch_index, idx].item()
333
+ adjusted_likelihood = adjusted_likelihoods[batch_index, idx].item()
334
+ token = self.grammar_constraint.tokenizer.decode([token_id])
335
+
336
+ # Store detailed information as a dictionary
337
+ accepted_info.append({
338
+ "token_id": token_id,
339
+ "token": str(token),
340
+ "raw_score": raw_score,
341
+ "raw_likelihood": likelihood,
342
+ "adjusted_likelihood": adjusted_likelihood
343
+ })
344
+
345
+ batch_accepted_info.append(accepted_info)
346
+
347
+ # Store this detailed information in the history
348
+ self.history.append(batch_accepted_info)
transformers_gad/grammar_utils.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .token_grammar_recognizer import IncrementalTokenRecognizer
2
+
3
+ # Old class name, kept for backward compatibility
4
+ IncrementalGrammarConstraint = IncrementalTokenRecognizer
transformers_gad/logging_config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # logging_config.py
2
+ import os
3
+ import logging
4
+
5
+
6
+ def setup_logging():
7
+ log_level_name = os.getenv(
8
+ "TCFG_LOG_LEVEL", "WARNING"
9
+ ).upper() # Default to WARNING if not set
10
+ log_levels = {
11
+ "DEBUG": logging.DEBUG,
12
+ "INFO": logging.INFO,
13
+ "WARNING": logging.WARNING,
14
+ "ERROR": logging.ERROR,
15
+ "CRITICAL": logging.CRITICAL,
16
+ }
17
+ log_level = log_levels.get(log_level_name, logging.WARNING)
18
+ logging.basicConfig(level=log_level)
transformers_gad/mapping.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ from transformers_gad.utils import get_tokenizer_model_type, ints2bytes
4
+ from transformers import AutoTokenizer
5
+ import logging
6
+
7
+ log = logging.getLogger(__name__)
8
+
9
+
10
+ def get_mapping(tokenizer, unicode=False):
11
+ log.debug(f"tokenizer type: {tokenizer.__class__.__name__}")
12
+ log.debug(f"tokenizer model type: {get_tokenizer_model_type(tokenizer)}")
13
+ if not unicode:
14
+ if (
15
+ "gpt2" in tokenizer.__class__.__name__.lower()
16
+ or "bloom" in tokenizer.__class__.__name__.lower()
17
+ or "pretrainedtokenizer" in tokenizer.__class__.__name__.lower()
18
+ or "codegen" in tokenizer.__class__.__name__.lower()
19
+ or "gptneox" in tokenizer.__class__.__name__.lower()
20
+ ):
21
+ return BBPEMapping(tokenizer)
22
+ elif "t5" in tokenizer.__class__.__name__.lower():
23
+ return BPEMapping(tokenizer)
24
+ elif "llama" in tokenizer.__class__.__name__.lower():
25
+ return LlamaBPEMapping(tokenizer)
26
+ elif "xglm" in tokenizer.__class__.__name__.lower():
27
+ return UniGramMapping(tokenizer)
28
+ else:
29
+ raise ValueError(f"Unknown tokenizer type: {tokenizer.__class__.__name__}")
30
+ else:
31
+ if "gpt2" in tokenizer.__class__.__name__.lower():
32
+ return UnicodeBBPEMapping(tokenizer)
33
+ else:
34
+ raise NotImplementedError(
35
+ f"Unicode mapping for {tokenizer.__class__.__name__}"
36
+ )
37
+
38
+
39
+ class Mapping:
40
+ def __init__(self, tokenizer):
41
+ self.eos_token_id = tokenizer.eos_token_id
42
+ self.bos_token_id = tokenizer.bos_token_id
43
+ self.tokenizer = tokenizer
44
+ self.special = tokenizer.all_special_ids
45
+
46
+ def __len__(self):
47
+ return len(self.tokenizer.get_vocab())
48
+
49
+ def _map(self, token_id: int) -> str:
50
+ # This is the case for BOS,
51
+ if token_id in self.special:
52
+ return ""
53
+ # if token_id is tensor, convert it to int
54
+ if hasattr(token_id, "item"):
55
+ token_id = token_id.item()
56
+ raw_token = self.tokenizer.convert_ids_to_tokens(token_id)
57
+ return raw_token
58
+
59
+ def map(self, token_id: int, verbose=False) -> bytes:
60
+ token = self._map(token_id)
61
+ if verbose:
62
+ log.debug(f"token_id: {token_id}, token: {token}")
63
+ return bytes(token, "utf-8")
64
+
65
+
66
+ class BBPEMapping(Mapping):
67
+ def __init__(self, *args, **kwargs):
68
+ super().__init__(*args, **kwargs)
69
+
70
+ def _map(self, token_id: int) -> str:
71
+ raw_token = super()._map(token_id)
72
+ if raw_token.startswith("Ġ"):
73
+ raw_token = raw_token.replace("Ġ", " ")
74
+ return raw_token
75
+
76
+
77
+ class UnicodeBBPEMapping(Mapping):
78
+ def __init__(self, *args, **kwargs):
79
+ super().__init__(*args, **kwargs)
80
+ self.intermediate_encoding = UnicodeBBPEMapping.get_intermediate_encoding(
81
+ self.tokenizer
82
+ )
83
+
84
+ def _map(self, token_id: int, verbose=False) -> str:
85
+ raw_token = super()._map(token_id)
86
+ # if raw_token.startswith("Ġ"):
87
+ # raw_token = raw_token.replace("Ġ", " ")
88
+ return raw_token
89
+
90
+ def map(self, token_id: int, verbose=False) -> bytes:
91
+ raw_token = self._map(token_id, verbose)
92
+ if verbose:
93
+ log.debug(f"token_id: {token_id}, raw_token: {raw_token}")
94
+ return self.intermediate_encoding.token2bytes(raw_token)
95
+
96
+ @staticmethod
97
+ def get_intermediate_encoding(tokenizer):
98
+ if "gpt2" in tokenizer.__class__.__name__.lower():
99
+ return ByteEncoding(tokenizer)
100
+ else:
101
+ return None
102
+
103
+
104
+ class BPEMapping(Mapping):
105
+ def __init__(self, tokenizer):
106
+ super().__init__(tokenizer)
107
+ self.last_token_id = None
108
+
109
+ def _map(self, token_id: int) -> str:
110
+ raw_token = super()._map(token_id)
111
+
112
+ # we need to check if the token is at the beginning of the sentence to remove the space
113
+ # specific to BPE
114
+ at_bos = False
115
+ if self.last_token_id is not None and self.last_token_id == self.bos_token_id:
116
+ at_bos = True
117
+ self.last_token_id = token_id
118
+ if raw_token.startswith("▁"):
119
+ raw_token = raw_token.replace("▁", " ")
120
+ if at_bos:
121
+ # remove space at the beginning of the sentence
122
+ raw_token = raw_token[1:]
123
+ return raw_token
124
+
125
+
126
+ class LlamaBPEMapping(BPEMapping):
127
+ def __init__(self, tokenizer):
128
+ super().__init__(tokenizer)
129
+
130
+ def _map(self, token_id: int) -> str:
131
+ raw_token = super()._map(token_id)
132
+ # if the token is hex, token is a string like "<0x00>"
133
+ # first 256 tokens are hex
134
+ if raw_token.startswith("<0x"):
135
+ hex_value = raw_token[4:-1]
136
+ raw_token = chr(int(hex_value, 16))
137
+ return raw_token
138
+
139
+
140
+ class WordPieceMapping(Mapping):
141
+ def __init__(self, tokenizer):
142
+ super().__init__(tokenizer)
143
+
144
+ def map(self, token_id: int) -> bytes:
145
+ if token_id in self.special:
146
+ return bytes()
147
+ return bytes(
148
+ self.tokenizer.decode([token_id], clean_up_tokenization_spaces=False),
149
+ "utf-8",
150
+ )
151
+
152
+
153
+ class UniGramMapping(Mapping):
154
+ def __init__(self, tokenizer):
155
+ super().__init__(tokenizer)
156
+
157
+ def map(self, token_id: int) -> bytes:
158
+ if token_id in self.special:
159
+ return bytes()
160
+ return bytes(
161
+ self.tokenizer.decode([token_id], clean_up_tokenization_spaces=False),
162
+ "utf-8",
163
+ )
164
+
165
+
166
+ class XGLMUniGramMapping(Mapping):
167
+ def __init__(self, tokenizer):
168
+ super().__init__(tokenizer)
169
+ self.bos_token_id = tokenizer.eos_token_id
170
+ self.eos_token_id = None
171
+
172
+
173
+ class ByteEncoding:
174
+ def __init__(self, tokenizer):
175
+ # check if the tokenizer is fast, if so, convert it to slow
176
+ if tokenizer.is_fast:
177
+ tokenizer = AutoTokenizer.from_pretrained(
178
+ tokenizer.name_or_path, use_fast=False
179
+ )
180
+ self.tokenizer = tokenizer
181
+ self.byte2char: Dict[int, str] = tokenizer.byte_encoder
182
+ self.char2byte: Dict[str, int] = tokenizer.byte_decoder
183
+ # code point to byte
184
+ self.cdp2byte: Dict[int, int] = {ord(c): b for c, b in self.char2byte.items()}
185
+ self.byte2cdp: Dict[int, int] = {v: k for k, v in self.cdp2byte.items()}
186
+
187
+ def map(self, byte: int) -> int:
188
+ assert 0 <= byte < 256, f"byte: {byte} is not in the range [0, 256)"
189
+ return ord(self.byte2char[byte])
190
+
191
+ def token_ids2bytes(self, token_ids: List[int]) -> bytes:
192
+ tokens: List[str] = self.tokenizer.convert_ids_to_tokens(token_ids)
193
+ # for token id = BOS, the token should be empty string instead of <s>
194
+ # TODO, this may cause issues because this means that special tokens like BOS can appear at any position
195
+ tokens = [
196
+ "" if token in self.tokenizer.all_special_ids else token for token in tokens
197
+ ]
198
+ bytes: List[List[int]] = [self.token2bytes(token) for token in tokens]
199
+ # join the bytes
200
+ return ints2bytes(sum(bytes, []))
201
+
202
+ def token_id2bytes(self, token_id: int) -> bytes:
203
+ token: str = self.tokenizer.convert_ids_to_tokens(token_id)
204
+ return self.token2bytes(token)
205
+
206
+ def token2bytes(self, token: str) -> bytes:
207
+ # import pdb; pdb.set_trace()
208
+ bytes_seq: List[int] = [self.char2byte[c] for c in token]
209
+ return bytes(bytes_seq)
transformers_gad/oracle/__init_.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .oracle_trie import Trie, TrieNode, update_oracle_trie
transformers_gad/oracle/__pycache__/oracle_trie.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
transformers_gad/oracle/oracle_trie.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import json
4
+ import logging
5
+
6
+ class TrieNode:
7
+ def __init__(self,
8
+ token_id=None, raw_likelihood=None, raw_score=None,
9
+ success_rate=1,
10
+ is_start_of_sequence=False, is_end_of_sequence=False,
11
+ eos_token_id=2):
12
+ self.children = {}
13
+ self.parent = None
14
+ self.token_id = token_id
15
+ self.raw_likelihood = raw_likelihood
16
+ self.raw_score = raw_score
17
+
18
+ # The default approximation of EFG
19
+ self.success_rate = success_rate
20
+
21
+ self.eos_token_id = eos_token_id
22
+ self.is_start_of_sequence = is_start_of_sequence
23
+ self.is_end_of_sequence = is_end_of_sequence
24
+
25
+ def insert(self, child_node):
26
+ """
27
+ Insert child_node into the children dictionary
28
+ """
29
+ if child_node.token_id not in self.children:
30
+ self.children[child_node.token_id] = child_node
31
+ child_node.parent = self
32
+
33
+ if child_node.token_id == self.eos_token_id:
34
+ child_node.is_end_of_sequence = True
35
+
36
+ # update the success rate of the parent node
37
+ return self.update_success_rate()
38
+ else:
39
+ return 0
40
+
41
+ def insert_accepted_tokens(self, scores, acceptance):
42
+ """
43
+ Create node from acceptance and scores and
44
+ insert as children of self node
45
+ """
46
+ likelihoods = F.softmax(scores, dim=-1)
47
+
48
+ for batch_index in range(acceptance.size(0)):
49
+ accepted_tokens = acceptance[batch_index].nonzero().squeeze(-1)
50
+
51
+ for token_id in accepted_tokens:
52
+ if token_id not in self.children:
53
+ raw_likelihood = likelihoods[batch_index, token_id].item()
54
+ raw_score = scores[batch_index, token_id].item()
55
+
56
+ child_node = TrieNode(
57
+ token_id=token_id.item(),
58
+ raw_likelihood=raw_likelihood,
59
+ raw_score=raw_score)
60
+
61
+ self.insert(child_node)
62
+
63
+ def get_success_rate(self, token_id):
64
+ """
65
+ Return Approximated Expected Future Grammaticality of the token_id
66
+ """
67
+ if token_id in self.children:
68
+ return self.children[token_id].success_rate
69
+ else:
70
+ return 1
71
+
72
+ def update_success_rate(self):
73
+ """
74
+ Re-compute the success rate from the updated success rate of children
75
+ """
76
+ if self.children:
77
+ total_success_rate = sum(child.raw_likelihood * child.success_rate for child in self.children.values())
78
+
79
+ # Get how much of unexplored nodes are covered with this update
80
+ updated_rate = self.success_rate - total_success_rate
81
+ self.success_rate = total_success_rate
82
+
83
+ # Back propagate the success rate
84
+ if self.parent:
85
+ return self.parent.update_success_rate()
86
+
87
+ return updated_rate
88
+
89
+ def prefix_raw_likelihood(self):
90
+ if self.parent:
91
+ return self.raw_likelihood * self.parent.prefix_raw_likelihood()
92
+ else:
93
+ return self.raw_likelihood
94
+
95
+ def search_token(self, token_id):
96
+ """
97
+ Check if the self node has a children with token_id
98
+ Return the children node if it exists, return None otherwise
99
+ """
100
+ if token_id in self.children:
101
+ return self.children[token_id]
102
+ else:
103
+ return None
104
+
105
+ def to_dict(self):
106
+ """
107
+ Convert a trie into a dictionary by removing the pointer to the parent
108
+ """
109
+ return {
110
+ "token_id": self.token_id,
111
+ "raw_likelihood": self.raw_likelihood,
112
+ "raw_score": self.raw_score,
113
+ "success_rate": self.success_rate,
114
+ "eos_token_id": self.eos_token_id,
115
+ "is_start_of_sequence": self.is_start_of_sequence,
116
+ "is_end_of_sequence": self.is_end_of_sequence,
117
+ "children": [child.to_dict() for child in self.children.values()]
118
+ }
119
+
120
+ @staticmethod
121
+ def from_dict(d):
122
+ """
123
+ Recursively (re)construct trie from dictionary
124
+ """
125
+ node = TrieNode(
126
+ token_id=d['token_id'],
127
+ raw_likelihood=d['raw_likelihood'],
128
+ raw_score=d['raw_score'],
129
+ success_rate=d['success_rate'],
130
+ is_start_of_sequence=d['is_start_of_sequence'],
131
+ is_end_of_sequence=d['is_end_of_sequence'],
132
+ eos_token_id=d['eos_token_id'])
133
+
134
+ node.children = {child['token_id']:TrieNode.from_dict(child) for child in node.children}
135
+ for child in node.children.values():
136
+ child.parent = node
137
+
138
+ return node
139
+
140
+ def __repr__(self):
141
+ parent_token_id = 'None (Root Node)' if self.parent is None else self.parent.token_id
142
+ return (f"TrieNode(token_id={self.token_id}', "
143
+ f"raw_likelihood={self.raw_likelihood}, raw_score={self.raw_score}, children={list(self.children.keys())}, "
144
+ f"parent={parent_token_id}, success rate={self.success_rate})")
145
+
146
+ class Trie:
147
+ def __init__(self):
148
+ self.root = TrieNode()
149
+ self.root.is_start_of_sequence = True
150
+
151
+ def search_last_parent(self, prefix: torch.LongTensor):
152
+ """
153
+ Search the longest prefix in the trie that matches to the input sequence of tokens 'prefix'
154
+ """
155
+ matched_prefix = []
156
+ current_parent = self.root
157
+
158
+ # Assume one batch of prefix
159
+ for time_step, token_id in enumerate(prefix[0]):
160
+ token_id = token_id.item()
161
+ if token_id in current_parent.children:
162
+ current_parent = current_parent.children[token_id]
163
+ matched_prefix.append(current_parent.token_id)
164
+ else:
165
+ print(
166
+ f"matched prefix is {matched_prefix}; current {token_id} not found in the trie at time step {time_step}")
167
+ return None
168
+
169
+ return current_parent
170
+
171
+ def search(self, sequence):
172
+ """
173
+ Return the sequence of nodes that exactly matches with the input
174
+ """
175
+ node = self.root
176
+ nodes = []
177
+ for token_id in sequence:
178
+ if token_id not in node.children:
179
+ return None
180
+ node = node.children[token_id]
181
+ nodes.append(node)
182
+ return nodes
183
+
184
+ def raw_likelihood(self, sequence):
185
+ """
186
+ Return the raw likelihood (before the adjustment) of sequence
187
+ """
188
+ if isinstance(sequence, torch.Tensor):
189
+ sequence = sequence.tolist()
190
+
191
+ nodes = self.search(sequence)
192
+ if nodes is None:
193
+ return None
194
+
195
+ likelihood = 1
196
+ for node in nodes:
197
+ likelihood *= node.raw_likelihood
198
+ return likelihood
199
+
200
+ def json(self):
201
+ return json.dumps(self.root.to_dict(), indent=2)
202
+
203
+ @staticmethod
204
+ def loads(js):
205
+ trie = Trie()
206
+ trie.root = TrieNode.from_dict(json.loads(js))
207
+
208
+ return trie
209
+
210
+ def print_trie(self, node=None, prefix=None):
211
+ """
212
+ Print all the leaves in the trie
213
+ """
214
+ if node is None:
215
+ node = self.root
216
+ if prefix is None:
217
+ prefix = []
218
+
219
+ # If current node marks the end of a sequence, print the prefix as a list
220
+ if node.is_end_of_sequence or len(node.children) == 0:
221
+ print(prefix)
222
+
223
+ # Recursively call print_trie for all children, appending the current character/token to the prefix
224
+ for char, child_node in node.children.items():
225
+ self.print_trie(child_node, prefix + [char])
226
+
227
+ def has_full_information(self):
228
+ """
229
+ Checks if all paths in the trie end with an is_end_of_sequence node set to True.
230
+ Returns True if the trie has full information, False otherwise.
231
+ """
232
+ return self._check_full_information(self.root)
233
+
234
+ def _check_full_information(self, node):
235
+ # If the node has no children, check if it is marked as the end of a sequence
236
+ if not node.children:
237
+ return node.is_end_of_sequence
238
+
239
+ # Recursively check all children
240
+ return all(self._check_full_information(child) for child in node.children.values())
241
+
242
+ def print_all_nodes(self, node=None, depth=0):
243
+ """
244
+ Print all the nodes in the trie (including non-leaves)
245
+ """
246
+
247
+ if node is None:
248
+ node = self.root
249
+
250
+ # Print current node's details
251
+ indent = " " * depth # Create indentation based on the depth in the trie
252
+ node_details = (f"{indent}TrieNode(token_id={node.token_id}', "
253
+ f"raw_likelihood={node.raw_likelihood}, raw_score={node.raw_score}, success rate={node.success_rate}, "
254
+ f"children={list(node.children.keys())}, "
255
+ f"parent={node.parent.token_id if node.parent else None}, "
256
+ f"is_end_of_sequence={node.is_end_of_sequence})")
257
+ print(node_details)
258
+
259
+ # Recursively call print_all_nodes for all children
260
+ for child_node in node.children.values():
261
+ self.print_all_nodes(child_node, depth + 1)
transformers_gad/parser.py ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import sys
4
+ from typing import List
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ END_OF_ALTERNATE_MARKER = 0
9
+ END_OF_RULE_MARKER = 0
10
+ END_OF_GRAMMAR_MARKER = 0xFFFF
11
+ TO_BE_FILLED_MARKER = 0
12
+ REF_RULE_MARKER = 1
13
+ LITERAL_MARKER = 2
14
+
15
+
16
+ ########################
17
+ # EBNF Grammar Parsing #
18
+ ########################
19
+
20
+
21
+ class ParseState:
22
+ def __init__(self):
23
+ self.symbol_table = {}
24
+ self.grammar_encoding = [] # old name: out_grammar
25
+
26
+ def print(self, file=sys.stdout):
27
+ print_grammar(file, self)
28
+
29
+
30
+ def get_symbol_id(state: ParseState, symbol_name: str) -> int:
31
+ if symbol_name not in state.symbol_table:
32
+ state.symbol_table[symbol_name] = len(state.symbol_table)
33
+ return state.symbol_table[symbol_name]
34
+
35
+
36
+ def generate_symbol_id(state: ParseState, base_name: str) -> int:
37
+ next_id = len(state.symbol_table)
38
+ state.symbol_table[base_name + "_" + str(next_id)] = next_id
39
+ return next_id
40
+
41
+
42
+ def is_word_char(c: str) -> bool:
43
+ """
44
+ Check if a char is a-z, A-Z, 0-9, -, _, i.e., chars allowed as rule names
45
+ Returns:
46
+
47
+ """
48
+ return c.isalnum() or c == "-" or c == "_"
49
+
50
+
51
+ def hex_to_int(c: str) -> int:
52
+ """
53
+ Convert a hex char to int, c should be in the range of 0-9, a-f, A-F
54
+ case insensitive
55
+ Args:
56
+ c: a hex char
57
+ Returns:
58
+ int: the int value of the hex char
59
+ """
60
+ if c.isdigit():
61
+ return int(c)
62
+ elif "a" <= c.lower() <= "f":
63
+ return ord(c.lower()) - ord("a") + 10
64
+ return -1
65
+
66
+
67
+ def remove_leading_white_space(src, rm_leading_newline):
68
+ """
69
+ Skips over whitespace and comments in the input string.
70
+
71
+ This function processes the input string, skipping over any spaces, tabs,
72
+ and content following a '#' character, which denotes a comment. The parsing
73
+ of a comment continues until the end of the line (denoted by newline characters
74
+ '\r' or '\n'). If the 'rm_leading_newline' parameter is set to False, the function
75
+ will stop processing and return the remaining string upon encountering a
76
+ newline character, otherwise it will skip over newline characters as well.
77
+
78
+ Parameters:
79
+ src (str): The input string to be processed.
80
+ rm_leading_newline (bool): A flag indicating whether encountering a newline character
81
+ should stop the parsing (False) or if it should be skipped (True).
82
+
83
+ Returns:
84
+ str: The remaining portion of the input string after skipping whitespace and comments.
85
+ """
86
+ pos = 0
87
+ while pos < len(src) and (src[pos].isspace() or src[pos] == "#"):
88
+ if src[pos] == "#":
89
+ while pos < len(src) and src[pos] not in ("\r", "\n"):
90
+ pos += 1
91
+ else:
92
+ if not rm_leading_newline and src[pos] in ("\r", "\n"):
93
+ break
94
+ pos += 1
95
+ return src[pos:]
96
+
97
+
98
+ def parse_name(src) -> (str, str):
99
+ """
100
+ parse the leading name from the input string
101
+ Args:
102
+ src: the input grammar string
103
+
104
+ Returns:
105
+ name, remaining_src
106
+ """
107
+ pos = 0
108
+ while pos < len(src) and is_word_char(src[pos]):
109
+ pos += 1
110
+ if pos == 0:
111
+ raise RuntimeError("expecting name at " + src)
112
+ return src[:pos], src[pos:]
113
+
114
+
115
+ def parse_char(src) -> (str, str):
116
+ """
117
+ parse the leading char from the input string
118
+ :param src:
119
+ :return: char, remaining_src
120
+ """
121
+
122
+ # if we have a backslash, it's maybe an escape
123
+ if src[0] == "\\":
124
+ esc = src[1]
125
+ if esc == "x":
126
+ first = hex_to_int(src[2])
127
+ if first > -1:
128
+ second = hex_to_int(src[3])
129
+ if second > -1:
130
+ return (first << 4) + second, src[4:]
131
+ raise RuntimeError("expecting \\xNN at " + src)
132
+ elif esc in ('"', "[", "]"):
133
+ return esc, src[2:]
134
+ elif esc == "r":
135
+ return "\r", src[2:]
136
+ elif esc == "n":
137
+ return "\n", src[2:]
138
+ elif esc == "t":
139
+ return "\t", src[2:]
140
+ elif esc == "\\":
141
+ return "\\", src[2:]
142
+ elif esc == "/":
143
+ return "\\", src[1:]
144
+ raise RuntimeError("unknown escape at " + src)
145
+ elif src:
146
+ return src[0], src[1:]
147
+ raise RuntimeError("unexpected end of input")
148
+
149
+
150
+ def _parse_rhs_literal_string(src: str, outbuf: List[int]) -> str:
151
+ assert src[0] == '"', f"rule should start with '\"', but got {src[0]}"
152
+ remaining_src = src[1:]
153
+
154
+ # advance until we get an end quote or run out of input
155
+ while remaining_src and remaining_src[0] != '"':
156
+ char, remaining_src = parse_char(remaining_src)
157
+ outbuf.append(LITERAL_MARKER)
158
+ # print(f"char: {char}")
159
+ outbuf.append(ord(char))
160
+ outbuf.append(ord(char))
161
+
162
+ # in case we ran out of input before finding the end quote
163
+ if not remaining_src:
164
+ raise RuntimeError(f"expecting an end quote at {src},but not found")
165
+
166
+ # remove the end quote and return the remaining string
167
+ return remaining_src[1:]
168
+
169
+
170
+ def _parse_rhs_char_ranges(src: str, outbuf: List[int]) -> str:
171
+ assert src[0] == "[", f"rule should start with '[', but got {src[0]}"
172
+ remaining_src = src[1:]
173
+ start_idx = len(outbuf)
174
+ # num chars in range - replaced at end of loop
175
+ outbuf.append(TO_BE_FILLED_MARKER)
176
+ while remaining_src and remaining_src[0] != "]":
177
+ char, remaining_src = parse_char(remaining_src)
178
+
179
+ outbuf.append(ord(char))
180
+ if remaining_src[0] == "-" and remaining_src[1] != "]":
181
+ endchar_pair, remaining_src = parse_char(remaining_src[1:])
182
+ outbuf.append(ord(endchar_pair))
183
+ else:
184
+ # This is the case for enumerate, e.g., [0123456789], [abcdef]
185
+ # Each char is considered as a range of itself, i.e., c-c
186
+ outbuf.append(ord(char))
187
+ if not remaining_src:
188
+ raise RuntimeError(
189
+ f"expecting an ] at {src},but not found, is the char range closed?"
190
+ )
191
+ # replace num chars with actual
192
+ outbuf[start_idx] = len(outbuf) - start_idx - 1
193
+ return remaining_src[1:]
194
+
195
+
196
+ def _parse_rhs_symbol_reference(src: str, state: ParseState, outbuf: List[int]) -> str:
197
+ assert is_word_char(src[0]), f"rule should start with a word char, but got {src[0]}"
198
+ name, remaining_src = parse_name(src)
199
+ ref_rule_id = get_symbol_id(state, name)
200
+ outbuf.append(REF_RULE_MARKER)
201
+ outbuf.append(ref_rule_id)
202
+ return remaining_src
203
+
204
+
205
+ def _parse_rhs_grouping(
206
+ remaining_src: str, state: ParseState, rule_name: str, outbuf: List[int]
207
+ ) -> str:
208
+ assert (
209
+ remaining_src[0] == "("
210
+ ), f"rule should start with '(', but got {remaining_src[0]}"
211
+ remaining_src = remove_leading_white_space(remaining_src[1:], True)
212
+ # parse nested alternates into synthesized rule
213
+ synthetic_rule_id = generate_symbol_id(state, rule_name)
214
+ remaining_src = parse_rhs(state, remaining_src, rule_name, synthetic_rule_id, True)
215
+ # output reference to synthesized rule
216
+ outbuf.append(REF_RULE_MARKER)
217
+ outbuf.append(synthetic_rule_id)
218
+
219
+ if not remaining_src or remaining_src[0] != ")":
220
+ raise RuntimeError("expecting ')' at " + remaining_src)
221
+ return remaining_src[1:]
222
+
223
+
224
+ def _parse_rhs_repetition_operators(
225
+ remaining_src: str,
226
+ state: ParseState,
227
+ rule_name: str,
228
+ last_sym_start: int,
229
+ outbuf: List[int],
230
+ ) -> str:
231
+ assert remaining_src[0] in (
232
+ "*",
233
+ "+",
234
+ "?",
235
+ ), f"rule should start with '*', '+', or '?', but got {remaining_src[0]}"
236
+ out_grammar = state.grammar_encoding
237
+ # last_sym_start = len(outbuf)
238
+
239
+ # apply transformation to previous symbol (last_sym_start -
240
+ # end) according to rewrite rules:
241
+ # S* --> S' ::= S S' |
242
+ # S+ --> S' ::= S S' | S
243
+ # S? --> S' ::= S |
244
+ sub_rule_id = generate_symbol_id(state, rule_name)
245
+ out_grammar.append(sub_rule_id)
246
+ sub_rule_offset = len(out_grammar)
247
+ # placeholder for size of 1st alternate
248
+ out_grammar.append(TO_BE_FILLED_MARKER)
249
+ # add preceding symbol to generated rule
250
+ out_grammar.extend(outbuf[last_sym_start:])
251
+ if remaining_src[0] in ("*", "+"):
252
+ # cause generated rule to recurse
253
+ out_grammar.append(REF_RULE_MARKER)
254
+ out_grammar.append(sub_rule_id)
255
+ # apply actual size
256
+ out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
257
+ # mark end of 1st alternate
258
+ out_grammar.append(END_OF_ALTERNATE_MARKER)
259
+ sub_rule_offset = len(out_grammar)
260
+ # placeholder for size of 2nd alternate
261
+ out_grammar.append(TO_BE_FILLED_MARKER)
262
+ if remaining_src[0] == "+":
263
+ # add preceding symbol as alternate only for '+'
264
+ out_grammar.extend(outbuf[last_sym_start:])
265
+ # apply actual size of 2nd alternate
266
+ out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
267
+ # mark end of 2nd alternate, then end of rule
268
+ out_grammar.append(END_OF_ALTERNATE_MARKER)
269
+ out_grammar.append(END_OF_RULE_MARKER)
270
+
271
+ # in original rule, replace previous symbol with reference to generated rule
272
+ outbuf[last_sym_start:] = [REF_RULE_MARKER, sub_rule_id]
273
+ return remaining_src[1:]
274
+
275
+
276
+ def parse_simple_rhs(state, rhs: str, rule_name: str, outbuf, is_nested):
277
+ simple_rhs_offset = len(outbuf)
278
+
279
+ # sequence size, will be replaced at end when known
280
+ outbuf.append(TO_BE_FILLED_MARKER)
281
+
282
+ last_sym_start = len(outbuf)
283
+ remaining_rhs = rhs
284
+ while remaining_rhs:
285
+ if remaining_rhs[0] == '"': # literal string
286
+ # mark the start of the last symbol, for repetition operator
287
+ last_sym_start = len(outbuf)
288
+ remaining_rhs = _parse_rhs_literal_string(remaining_rhs, outbuf)
289
+ elif remaining_rhs[0] == "[": # char range(s)
290
+ # mark the start of the last symbol, for repetition operator
291
+ last_sym_start = len(outbuf)
292
+ remaining_rhs = _parse_rhs_char_ranges(remaining_rhs, outbuf)
293
+ elif is_word_char(remaining_rhs[0]): # rule reference
294
+ # mark the start of the last symbol, for repetition operator
295
+ last_sym_start = len(outbuf)
296
+ remaining_rhs = _parse_rhs_symbol_reference(remaining_rhs, state, outbuf)
297
+ elif remaining_rhs[0] == "(": # grouping
298
+ # mark the start of the last symbol, for repetition operator
299
+ last_sym_start = len(outbuf)
300
+ remaining_rhs = _parse_rhs_grouping(remaining_rhs, state, rule_name, outbuf)
301
+ elif remaining_rhs[0] in ("*", "+", "?"): # repetition operator
302
+ # No need to mark the start of the last symbol, because we already did it
303
+ if len(outbuf) - simple_rhs_offset - 1 == 0:
304
+ raise RuntimeError(
305
+ "expecting preceeding item to */+/? at " + remaining_rhs
306
+ )
307
+ remaining_rhs = _parse_rhs_repetition_operators(
308
+ remaining_rhs, state, rule_name, last_sym_start, outbuf
309
+ )
310
+ else:
311
+ # case for newline, i.e., end of rule
312
+ assert remaining_rhs[0] in [
313
+ "\n",
314
+ "|",
315
+ ")",
316
+ ], f"rule should end with newline or '|', but got {remaining_rhs[0]}"
317
+ # we break here so that we call parse_rule again to parse the next rule
318
+ break
319
+ # Here we do not rm newline deliberately so that we know the rhs is ended
320
+ remaining_rhs = remove_leading_white_space(
321
+ remaining_rhs, rm_leading_newline=is_nested
322
+ )
323
+
324
+ # apply actual size of this alternate sequence
325
+ outbuf[simple_rhs_offset] = len(outbuf) - simple_rhs_offset
326
+ # mark end of alternate
327
+ outbuf.append(END_OF_ALTERNATE_MARKER)
328
+ return remaining_rhs
329
+
330
+
331
+ def parse_rhs(state, rhs: str, rule_name, rule_id, is_nested):
332
+ outbuf = []
333
+ remaining_rhs = parse_simple_rhs(state, rhs, rule_name, outbuf, is_nested)
334
+ while remaining_rhs and remaining_rhs[0] == "|":
335
+ remaining_rhs = remove_leading_white_space(remaining_rhs[1:], True)
336
+ remaining_rhs = parse_simple_rhs(
337
+ state, remaining_rhs, rule_name, outbuf, is_nested
338
+ )
339
+
340
+ # Now we have finished parsing the rhs, we can add the rule to the grammar_encoding
341
+ state.grammar_encoding.append(rule_id)
342
+ state.grammar_encoding.extend(outbuf)
343
+ state.grammar_encoding.append(END_OF_RULE_MARKER)
344
+ return remaining_rhs
345
+
346
+
347
+ def parse_rule(state: ParseState, rule_text: str) -> str:
348
+ name, remaining_rule_text = parse_name(rule_text)
349
+ remaining_rule_text = remove_leading_white_space(remaining_rule_text, False)
350
+ # check if the rule is already defined, TODO: what will happen if the rule is already defined?
351
+ rule_id = get_symbol_id(state, name)
352
+
353
+ if remaining_rule_text[:3] != "::=":
354
+ raise RuntimeError("expecting ::= at " + remaining_rule_text)
355
+ remaining_rule_text = remove_leading_white_space(remaining_rule_text[3:], True)
356
+
357
+ remaining_rule_text = parse_rhs(state, remaining_rule_text, name, rule_id, False)
358
+
359
+ if remaining_rule_text and remaining_rule_text[0] == "\r":
360
+ remaining_rule_text = (
361
+ remaining_rule_text[2:]
362
+ if remaining_rule_text[1] == "\n"
363
+ else remaining_rule_text[1:]
364
+ )
365
+ elif remaining_rule_text and remaining_rule_text[0] == "\n":
366
+ remaining_rule_text = remaining_rule_text[1:]
367
+ elif remaining_rule_text:
368
+ raise RuntimeError("expecting newline or end at " + remaining_rule_text)
369
+ return remove_leading_white_space(remaining_rule_text, True)
370
+
371
+
372
+ def parse_ebnf(grammar_text: str) -> ParseState:
373
+ try:
374
+ state = ParseState()
375
+ remaining_grammar_text = remove_leading_white_space(grammar_text, True)
376
+ last_grammar_repr = ""
377
+ while remaining_grammar_text:
378
+ if last_grammar_repr:
379
+ last_parsed_rule_len = len(last_grammar_repr) - len(
380
+ remaining_grammar_text
381
+ )
382
+ logger.debug(
383
+ f"last_parsed_rule: {last_grammar_repr[:last_parsed_rule_len]}"
384
+ )
385
+ last_grammar_repr = remaining_grammar_text
386
+ remaining_grammar_text = parse_rule(state, remaining_grammar_text)
387
+ state.grammar_encoding.append(END_OF_GRAMMAR_MARKER)
388
+ return state
389
+ except RuntimeError as err:
390
+ logger.warning("error parsing grammar:", err)
391
+ return ParseState()
392
+
393
+
394
+ ###################################
395
+ # EBNF Grammar Parsing ends here #
396
+ ###################################
397
+
398
+
399
+ def break_grammar_into_rules(grammar_encoding: List[int]) -> List[List[int]]:
400
+ offset = 0
401
+ # we loop until we reach the end of the grammar_encoding
402
+ rule_encodings = []
403
+ i = 0
404
+ while i < len(grammar_encoding) - 2:
405
+ if (
406
+ grammar_encoding[i] == END_OF_ALTERNATE_MARKER
407
+ and grammar_encoding[i + 1] == END_OF_RULE_MARKER
408
+ ):
409
+ rule_encodings.append(grammar_encoding[offset : i + 2])
410
+ offset = i + 2
411
+ # skip the END_OF_RULE_MARKER
412
+ # This is mandatory because if we do not skip the END_OF_RULE_MARKER
413
+ # we fail in the case where the next rule has rule_id 0
414
+ i += 1
415
+ i += 1
416
+ return rule_encodings
417
+
418
+
419
+ def break_rule_into_elements(rule_encoding: List[int]) -> List[List[int]]:
420
+ rule_id = rule_encoding.pop(0)
421
+ end_of_rule_marker = rule_encoding.pop(-1)
422
+ assert (
423
+ end_of_rule_marker == END_OF_RULE_MARKER
424
+ ), f"rule should end with {END_OF_RULE_MARKER}, but got {end_of_rule_marker}"
425
+
426
+ offset = 0
427
+ elements = []
428
+ while offset < len(rule_encoding):
429
+ element_size = rule_encoding[offset]
430
+ assert (
431
+ rule_encoding[offset + element_size] == END_OF_ALTERNATE_MARKER
432
+ ), f"element should end with {END_OF_ALTERNATE_MARKER}, but got {rule_encoding[offset + element_size]}"
433
+ elements.append(rule_encoding[offset : offset + element_size + 1])
434
+ offset += element_size + 1
435
+ return elements
436
+
437
+
438
+ def _print_annotated_grammar(file, grammar_encoding, symbol_id_names, index=0):
439
+ rule_id = grammar_encoding[index]
440
+ print(f"<{index}>{symbol_id_names[rule_id]} ::=", end=" ", file=file)
441
+ pos = index + 1
442
+ while grammar_encoding[pos]:
443
+ if pos - 1 > index:
444
+ print("|", end=" ", file=file)
445
+ pos += 1 # sequence size, not needed here
446
+ while grammar_encoding[pos]:
447
+ if grammar_encoding[pos] == REF_RULE_MARKER:
448
+ ref_rule_id = grammar_encoding[pos + 1]
449
+ print(
450
+ f"<{pos}>{symbol_id_names[ref_rule_id]}",
451
+ end=" ",
452
+ file=file,
453
+ )
454
+ pos += 2
455
+ else:
456
+ print("<{}>[".format(pos), end="", file=file)
457
+ num_chars = grammar_encoding[pos]
458
+ pos += 1
459
+
460
+ for i in range(0, num_chars, 2):
461
+ print(
462
+ "{}-".format(chr(grammar_encoding[pos + i])), end="", file=file
463
+ )
464
+ if i + 1 < num_chars:
465
+ print(
466
+ "{}".format(chr(grammar_encoding[pos + i + 1])),
467
+ end="",
468
+ file=file,
469
+ )
470
+ print("]", end=" ", file=file)
471
+ pos += num_chars
472
+ pos += 1
473
+ print(file=file)
474
+ return pos + 1
475
+
476
+
477
+ def print_grammar(file, state):
478
+ pos = 0
479
+ symbol_id_names = {v: k for k, v in state.symbol_table.items()}
480
+ print("Grammar Rules:", file=file)
481
+ while (
482
+ pos < len(state.grammar_encoding)
483
+ and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
484
+ ):
485
+ pos = _print_annotated_grammar(
486
+ file, state.grammar_encoding, symbol_id_names, pos
487
+ )
488
+ if pos > len(state.grammar_encoding):
489
+ raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")
490
+ pos = 0
491
+ print("\nGrammar Hex representation:", file=file)
492
+ while (
493
+ pos < len(state.grammar_encoding)
494
+ and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
495
+ ):
496
+ print(f"{state.grammar_encoding[pos]:04x}", end=" ", file=file)
497
+ pos += 1
498
+ if pos > len(state.grammar_encoding):
499
+ raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")
500
+ else:
501
+ print("ffff\n")
502
+
503
+ print("Rules Decimal representation:", file=file)
504
+ # we loop until we reach the end of the grammar_encoding
505
+ rule_encodings = break_grammar_into_rules(state.grammar_encoding)
506
+ for rule_encoding in rule_encodings:
507
+ rule_id = rule_encoding[0]
508
+ print(
509
+ f"<{rule_id}> {break_rule_into_elements(rule_encoding)}",
510
+ file=file,
511
+ )
512
+
513
+
514
+ if __name__ == "__main__":
515
+ parser = argparse.ArgumentParser(description="Parse EBNF grammar files.")
516
+ parser.add_argument(
517
+ "-g",
518
+ "--grammar-file",
519
+ nargs="?",
520
+ default="/nobackup2/yf/mila/GD/examples/sygus/PRE_100_bare.ebnf",
521
+ help="Path to the grammar file",
522
+ )
523
+
524
+ args = parser.parse_args()
525
+
526
+ # set logging level
527
+ logging.basicConfig(level=logging.DEBUG)
528
+
529
+ with open(args.grammar_file, "r") as file:
530
+ input_text = file.read()
531
+ parsed_grammar = parse_ebnf(input_text)
532
+ print("parse state:")
533
+ parsed_grammar.print()
534
+ # print(f"symbol_ids: \n{parsed_grammar.symbol_table}")
535
+
536
+ # start_rule_id = parsed_grammar.symbol_table["root"]
537
+
538
+ # DEBUG: __main__:last_parsed_rule: root: := "0"d | "1"a
539
+ #
540
+ # DEBUG: __main__:last_parsed_rule: a: := "0"c | "1"b
541
+ #
542
+ # DEBUG: __main__:last_parsed_rule: b: := "0" | "1"
543
+ #
544
+ # DEBUG: __main__:last_parsed_rule: c: := "0" | "1"
545
+ #
546
+ # DEBUG: __main__:last_parsed_rule: d: := "0"e
547
+ #
548
+ # parse state:
549
+ # Grammar Rules:
550
+ # < 0 > root: := < 2 > [0 - 0] < 5 > d | < 9 > [1 - 1] < 12 > a
551
+ # < 16 > a: := < 18 > [0 - 0] < 21 > c | < 25 > [1 - 1] < 28 > b
552
+ # < 32 > b: := < 34 > [0 - 0] | < 39 > [1 - 1]
553
+ # < 44 > c: := < 46 > [0 - 0] | < 51 > [1 - 1]
554
+ # < 56 > d: := < 58 > [0 - 0] < 61 > e
555
+ # < 65 > e: := < 67 > [0 - 0]
556
+ #
557
+ # Grammar Hex representation:
558
+ # 0000 0006 0002 0030 0030 0001 0001 0000
559
+ # 0006 0002 0031 0031 0001 0002 0000 0000
560
+ # 0002 0006 0002 0030 0030 0001 0003 0000
561
+ # 0006 0002 0031 0031 0001 0004 0000 0000
562
+ # 0004 0004 0002 0030 0030 0000 0004 0002
563
+ # 0031 0031 0000 0000 0003 0004 0002 0030
564
+ # 0030 0000 0004 0002 0031 0031 0000 0000
565
+ # 0001 0006 0002 0030 0030 0001 0005 0000
566
+ # 0000 0005 0004 0002 0030 0030 0000 0000 ffff
567
+ #
568
+ # Rules Decimal representation:
569
+ # < 0 > [[6, 2, 48, 48, 1, 1, 0], [6, 2, 49, 49, 1, 2, 0]]
570
+ # < 2 > [[6, 2, 48, 48, 1, 3, 0], [6, 2, 49, 49, 1, 4, 0]]
571
+ # < 4 > [[4, 2, 48, 48, 0], [4, 2, 49, 49, 0]]
572
+ # < 3 > [[4, 2, 48, 48, 0], [4, 2, 49, 49, 0]]
573
+ # < 1 > [[6, 2, 48, 48, 1, 5, 0]]
574
+ # < 5 > [[4, 2, 48, 48, 0]]
575
+
576
+
transformers_gad/parser_cfg.py ADDED
@@ -0,0 +1,530 @@
1
+ import argparse
2
+ import logging
3
+ import sys
4
+ from typing import List
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ END_OF_ALTERNATE_MARKER = 0
9
+ END_OF_RULE_MARKER = 0
10
+ END_OF_GRAMMAR_MARKER = 0xFFFF
11
+ TO_BE_FILLED_MARKER = 0
12
+ REF_RULE_MARKER = 1
13
+ LITERAL_MARKER = 2
14
+
15
+
16
+ ########################
17
+ # EBNF Grammar Parsing #
18
+ ########################
19
+
20
+
21
+ class ParseState:
22
+ def __init__(self):
23
+ self.symbol_table = {}
24
+ self.grammar_encoding = [] # old name: out_grammar
25
+
26
+ def print(self, file=sys.stdout):
27
+ print_grammar(file, self)
28
+
29
+
30
+ def get_symbol_id(state: ParseState, symbol_name: str) -> int:
31
+ if symbol_name not in state.symbol_table:
32
+ state.symbol_table[symbol_name] = len(state.symbol_table)
33
+ return state.symbol_table[symbol_name]
34
+
35
+
36
+ def generate_symbol_id(state: ParseState, base_name: str) -> int:
37
+ next_id = len(state.symbol_table)
38
+ state.symbol_table[base_name + "_" + str(next_id)] = next_id
39
+ return next_id
40
+
41
+
42
+ def is_word_char(c: str) -> bool:
43
+ """
44
+ Check if a char is a-z, A-Z, 0-9, -, _, i.e., chars allowed as rule names
45
+ Returns:
46
+
47
+ """
48
+ return c.isalnum() or c == "-" or c == "_"
49
+
50
+
51
+ def hex_to_int(c: str) -> int:
52
+ """
53
+ Convert a hex char to int, c should be in the range of 0-9, a-f, A-F
54
+ case insensitive
55
+ Args:
56
+ c: a hex char
57
+ Returns:
58
+ int: the int value of the hex char
59
+ """
60
+ if c.isdigit():
61
+ return int(c)
62
+ elif "a" <= c.lower() <= "f":
63
+ return ord(c.lower()) - ord("a") + 10
64
+ return -1
65
+
66
+
67
+ def remove_leading_white_space(src, rm_leading_newline):
68
+ """
69
+ Skips over whitespace and comments in the input string.
70
+
71
+ This function processes the input string, skipping over any spaces, tabs,
72
+ and content following a '#' character, which denotes a comment. The parsing
73
+ of a comment continues until the end of the line (denoted by newline characters
74
+ '\r' or '\n'). If the 'rm_leading_newline' parameter is set to False, the function
75
+ will stop processing and return the remaining string upon encountering a
76
+ newline character, otherwise it will skip over newline characters as well.
77
+
78
+ Parameters:
79
+ src (str): The input string to be processed.
80
+ rm_leading_newline (bool): A flag indicating whether encountering a newline character
81
+ should stop the parsing (False) or if it should be skipped (True).
82
+
83
+ Returns:
84
+ str: The remaining portion of the input string after skipping whitespace and comments.
85
+ """
86
+ pos = 0
87
+ while pos < len(src) and (src[pos].isspace() or src[pos] == "#"):
88
+ if src[pos] == "#":
89
+ while pos < len(src) and src[pos] not in ("\r", "\n"):
90
+ pos += 1
91
+ else:
92
+ if not rm_leading_newline and src[pos] in ("\r", "\n"):
93
+ break
94
+ pos += 1
95
+ return src[pos:]
96
+
97
+
98
+ def parse_name(src) -> (str, str):
99
+ """
100
+ parse the leading name from the input string
101
+ Args:
102
+ src: the input grammar string
103
+
104
+ Returns:
105
+ name, remaining_src
106
+ """
107
+ pos = 0
108
+ while pos < len(src) and is_word_char(src[pos]):
109
+ pos += 1
110
+ if pos == 0:
111
+ raise RuntimeError("expecting name at " + src)
112
+ return src[:pos], src[pos:]
113
+
114
+
115
+ def parse_char(src) -> (str, str):
116
+ """
117
+ parse the leading char from the input string
118
+ :param src:
119
+ :return: char, remaining_src
120
+ """
121
+
122
+ # if we have a backslash, it's maybe an escape
123
+ if src[0] == "\\":
124
+ esc = src[1]
125
+ if esc == "x":
126
+ first = hex_to_int(src[2])
127
+ if first > -1:
128
+ second = hex_to_int(src[3])
129
+ if second > -1:
130
+ return chr((first << 4) + second), src[4:]
131
+ raise RuntimeError("expecting \\xNN at " + src)
132
+ elif esc in ('"', "[", "]"):
133
+ return esc, src[2:]
134
+ elif esc == "r":
135
+ return "\r", src[2:]
136
+ elif esc == "n":
137
+ return "\n", src[2:]
138
+ elif esc == "t":
139
+ return "\t", src[2:]
140
+ raise RuntimeError("unknown escape at " + src)
141
+ elif src:
142
+ return src[0], src[1:]
143
+ raise RuntimeError("unexpected end of input")
144
+
145
+
146
+ def _parse_rhs_literal_string(src: str, outbuf: List[int]) -> str:
147
+ assert src[0] == '"', f"rule should start with '\"', but got {src[0]}"
148
+ remaining_src = src[1:]
149
+
150
+ # advance until we get an end quote or run out of input
151
+ while remaining_src and remaining_src[0] != '"':
152
+ char, remaining_src = parse_char(remaining_src)
153
+ outbuf.append(LITERAL_MARKER)
154
+ outbuf.append(ord(char))
155
+ outbuf.append(ord(char))
156
+
157
+ # in case we ran out of input before finding the end quote
158
+ if not remaining_src:
159
+ raise RuntimeError(f"expecting an end quote at {src},but not found")
160
+
161
+ # remove the end quote and return the remaining string
162
+ return remaining_src[1:]
163
+
164
+
165
+ def _parse_rhs_char_ranges(src: str, outbuf: List[int]) -> str:
166
+ assert src[0] == "[", f"rule should start with '[', but got {src[0]}"
167
+ remaining_src = src[1:]
168
+ start_idx = len(outbuf)
169
+ # num chars in range - replaced at end of loop
170
+ outbuf.append(TO_BE_FILLED_MARKER)
171
+ while remaining_src and remaining_src[0] != "]":
172
+ char, remaining_src = parse_char(remaining_src)
173
+
174
+ outbuf.append(ord(char))
175
+ if remaining_src[0] == "-" and remaining_src[1] != "]":
176
+ endchar_pair, remaining_src = parse_char(remaining_src[1:])
177
+ outbuf.append(ord(endchar_pair))
178
+ else:
179
+ # This is the case for enumerate, e.g., [0123456789], [abcdef]
180
+ # Each char is considered as a range of itself, i.e., c-c
181
+ outbuf.append(ord(char))
182
+ if not remaining_src:
183
+ raise RuntimeError(
184
+ f"expecting an ] at {src},but not found, is the char range closed?"
185
+ )
186
+ # replace num chars with actual
187
+ outbuf[start_idx] = len(outbuf) - start_idx - 1
188
+ return remaining_src[1:]
189
+
190
+
191
+ def _parse_rhs_symbol_reference(src: str, state: ParseState, outbuf: List[int]) -> str:
192
+ assert is_word_char(src[0]), f"rule should start with a word char, but got {src[0]}"
193
+ name, remaining_src = parse_name(src)
194
+ ref_rule_id = get_symbol_id(state, name)
195
+ outbuf.append(REF_RULE_MARKER)
196
+ outbuf.append(ref_rule_id)
197
+ return remaining_src
198
+
199
+
200
+ def _parse_rhs_grouping(
201
+ remaining_src: str, state: ParseState, rule_name: str, outbuf: List[int]
202
+ ) -> str:
203
+ assert (
204
+ remaining_src[0] == "("
205
+ ), f"rule should start with '(', but got {remaining_src[0]}"
206
+ remaining_src = remove_leading_white_space(remaining_src[1:], True)
207
+ # parse nested alternates into synthesized rule
208
+ synthetic_rule_id = generate_symbol_id(state, rule_name)
209
+ remaining_src = parse_rhs(state, remaining_src, rule_name, synthetic_rule_id, True)
210
+ # output reference to synthesized rule
211
+ outbuf.append(REF_RULE_MARKER)
212
+ outbuf.append(synthetic_rule_id)
213
+
214
+ if not remaining_src or remaining_src[0] != ")":
215
+ raise RuntimeError("expecting ')' at " + remaining_src)
216
+ return remaining_src[1:]
217
+
218
+
219
+ def _parse_rhs_repetition_operators(
220
+ remaining_src: str,
221
+ state: ParseState,
222
+ rule_name: str,
223
+ last_sym_start: int,
224
+ outbuf: List[int],
225
+ ) -> str:
226
+ assert remaining_src[0] in (
227
+ "*",
228
+ "+",
229
+ "?",
230
+ ), f"rule should start with '*', '+', or '?', but got {remaining_src[0]}"
231
+ out_grammar = state.grammar_encoding
232
+ # last_sym_start = len(outbuf)
233
+
234
+ # apply transformation to previous symbol (last_sym_start -
235
+ # end) according to rewrite rules:
236
+ # S* --> S' ::= S S' |
237
+ # S+ --> S' ::= S S' | S
238
+ # S? --> S' ::= S |
239
+ sub_rule_id = generate_symbol_id(state, rule_name)
240
+ out_grammar.append(sub_rule_id)
241
+ sub_rule_offset = len(out_grammar)
242
+ # placeholder for size of 1st alternate
243
+ out_grammar.append(TO_BE_FILLED_MARKER)
244
+ # add preceding symbol to generated rule
245
+ out_grammar.extend(outbuf[last_sym_start:])
246
+ if remaining_src[0] in ("*", "+"):
247
+ # cause generated rule to recurse
248
+ out_grammar.append(REF_RULE_MARKER)
249
+ out_grammar.append(sub_rule_id)
250
+ # apply actual size
251
+ out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
252
+ # mark end of 1st alternate
253
+ out_grammar.append(END_OF_ALTERNATE_MARKER)
254
+ sub_rule_offset = len(out_grammar)
255
+ # placeholder for size of 2nd alternate
256
+ out_grammar.append(TO_BE_FILLED_MARKER)
257
+ if remaining_src[0] == "+":
258
+ # add preceding symbol as alternate only for '+'
259
+ out_grammar.extend(outbuf[last_sym_start:])
260
+ # apply actual size of 2nd alternate
261
+ out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
262
+ # mark end of 2nd alternate, then end of rule
263
+ out_grammar.append(END_OF_ALTERNATE_MARKER)
264
+ out_grammar.append(END_OF_RULE_MARKER)
265
+
266
+ # in original rule, replace previous symbol with reference to generated rule
267
+ outbuf[last_sym_start:] = [REF_RULE_MARKER, sub_rule_id]
268
+ return remaining_src[1:]
269
+
270
+
271
+ def parse_simple_rhs(state, rhs: str, rule_name: str, outbuf, is_nested):
272
+ simple_rhs_offset = len(outbuf)
273
+
274
+ # sequence size, will be replaced at end when known
275
+ outbuf.append(TO_BE_FILLED_MARKER)
276
+
277
+ last_sym_start = len(outbuf)
278
+ remaining_rhs = rhs
279
+ while remaining_rhs:
280
+ if remaining_rhs[0] == '"': # literal string
281
+ # mark the start of the last symbol, for repetition operator
282
+ last_sym_start = len(outbuf)
283
+ remaining_rhs = _parse_rhs_literal_string(remaining_rhs, outbuf)
284
+ elif remaining_rhs[0] == "[": # char range(s)
285
+ # mark the start of the last symbol, for repetition operator
286
+ last_sym_start = len(outbuf)
287
+ remaining_rhs = _parse_rhs_char_ranges(remaining_rhs, outbuf)
288
+ elif is_word_char(remaining_rhs[0]): # rule reference
289
+ # mark the start of the last symbol, for repetition operator
290
+ last_sym_start = len(outbuf)
291
+ remaining_rhs = _parse_rhs_symbol_reference(remaining_rhs, state, outbuf)
292
+ elif remaining_rhs[0] == "(": # grouping
293
+ # mark the start of the last symbol, for repetition operator
294
+ last_sym_start = len(outbuf)
295
+ remaining_rhs = _parse_rhs_grouping(remaining_rhs, state, rule_name, outbuf)
296
+ elif remaining_rhs[0] in ("*", "+", "?"): # repetition operator
297
+ # No need to mark the start of the last symbol, because we already did it
298
+ if len(outbuf) - simple_rhs_offset - 1 == 0:
299
+ raise RuntimeError(
300
+ "expecting preceeding item to */+/? at " + remaining_rhs
301
+ )
302
+ remaining_rhs = _parse_rhs_repetition_operators(
303
+ remaining_rhs, state, rule_name, last_sym_start, outbuf
304
+ )
305
+ else:
306
+ # case for newline, i.e., end of rule
307
+ assert remaining_rhs[0] in [
308
+ "\n",
309
+ "|",
310
+ ")",
311
+ ], f"rule should end with newline or '|', but got {remaining_rhs[0]}"
312
+ # we break here so that we call parse_rule again to parse the next rule
313
+ break
314
+ # Here we do not rm newline deliberately so that we know the rhs is ended
315
+ remaining_rhs = remove_leading_white_space(
316
+ remaining_rhs, rm_leading_newline=is_nested
317
+ )
318
+
319
+ # apply actual size of this alternate sequence
320
+ outbuf[simple_rhs_offset] = len(outbuf) - simple_rhs_offset
321
+ # mark end of alternate
322
+ outbuf.append(END_OF_ALTERNATE_MARKER)
323
+ return remaining_rhs
324
+
325
+
326
+ def parse_rhs(state, rhs: str, rule_name, rule_id, is_nested):
327
+ outbuf = []
328
+ remaining_rhs = parse_simple_rhs(state, rhs, rule_name, outbuf, is_nested)
329
+ while remaining_rhs and remaining_rhs[0] == "|":
330
+ remaining_rhs = remove_leading_white_space(remaining_rhs[1:], True)
331
+ remaining_rhs = parse_simple_rhs(
332
+ state, remaining_rhs, rule_name, outbuf, is_nested
333
+ )
334
+
335
+ # Now we have finished parsing the rhs, we can add the rule to the grammar_encoding
336
+ state.grammar_encoding.append(rule_id)
337
+ state.grammar_encoding.extend(outbuf)
338
+ state.grammar_encoding.append(END_OF_RULE_MARKER)
339
+ return remaining_rhs
340
+
341
+
342
+ def parse_rule(state: ParseState, rule_text: str) -> str:
343
+ name, remaining_rule_text = parse_name(rule_text)
344
+ remaining_rule_text = remove_leading_white_space(remaining_rule_text, False)
345
+ # check if the rule is already defined, TODO: what will happen if the rule is already defined?
346
+ rule_id = get_symbol_id(state, name)
347
+
348
+ if remaining_rule_text[:3] != "::=":
349
+ raise RuntimeError("expecting ::= at " + remaining_rule_text)
350
+ remaining_rule_text = remove_leading_white_space(remaining_rule_text[3:], True)
351
+
352
+ remaining_rule_text = parse_rhs(state, remaining_rule_text, name, rule_id, False)
353
+
354
+ if remaining_rule_text and remaining_rule_text[0] == "\r":
355
+ remaining_rule_text = (
356
+ remaining_rule_text[2:]
357
+ if remaining_rule_text[1] == "\n"
358
+ else remaining_rule_text[1:]
359
+ )
360
+ elif remaining_rule_text and remaining_rule_text[0] == "\n":
361
+ remaining_rule_text = remaining_rule_text[1:]
362
+ elif remaining_rule_text:
363
+ raise RuntimeError("expecting newline or end at " + remaining_rule_text)
364
+ return remove_leading_white_space(remaining_rule_text, True)
365
+
366
+
367
+ def parse_ebnf(grammar_text: str) -> ParseState:
368
+ try:
369
+ state = ParseState()
370
+ remaining_grammar_text = remove_leading_white_space(grammar_text, True)
371
+ last_grammar_repr = ""
372
+ while remaining_grammar_text:
373
+ if last_grammar_repr:
374
+ last_parsed_rule_len = len(last_grammar_repr) - len(
375
+ remaining_grammar_text
376
+ )
377
+ logger.debug(
378
+ f"last_parsed_rule: {last_grammar_repr[:last_parsed_rule_len]}"
379
+ )
380
+ last_grammar_repr = remaining_grammar_text
381
+ remaining_grammar_text = parse_rule(state, remaining_grammar_text)
382
+ state.grammar_encoding.append(END_OF_GRAMMAR_MARKER)
383
+ return state
384
+ except RuntimeError as err:
385
+ logger.warning("error parsing grammar:", err)
386
+ return ParseState()
387
+
388
+
389
+ ###################################
390
+ # EBNF Grammar Parsing ends here #
391
+ ###################################
392
+
393
+
394
+ def break_grammar_into_rules(grammar_encoding: List[int]) -> List[List[int]]:
395
+ offset = 0
396
+ # we loop until we reach the end of the grammar_encoding
397
+ rule_encodings = []
398
+ i = 0
399
+ while i < len(grammar_encoding) - 2:
400
+ if (
401
+ grammar_encoding[i] == END_OF_ALTERNATE_MARKER
402
+ and grammar_encoding[i + 1] == END_OF_RULE_MARKER
403
+ ):
404
+ rule_encodings.append(grammar_encoding[offset : i + 2])
405
+ offset = i + 2
406
+ # skip the END_OF_RULE_MARKER
407
+ # This is mandatory because if we do not skip the END_OF_RULE_MARKER
408
+ # we fail in the case where the next rule has rule_id 0
409
+ i += 1
410
+ i += 1
411
+ return rule_encodings
412
+
413
+
414
+ def break_rule_into_elements(rule_encoding: List[int]) -> List[List[int]]:
415
+ rule_id = rule_encoding.pop(0)
416
+ end_of_rule_marker = rule_encoding.pop(-1)
417
+ assert (
418
+ end_of_rule_marker == END_OF_RULE_MARKER
419
+ ), f"rule should end with {END_OF_RULE_MARKER}, but got {end_of_rule_marker}"
420
+
421
+ offset = 0
422
+ elements = []
423
+ while offset < len(rule_encoding):
424
+ element_size = rule_encoding[offset]
425
+ assert (
426
+ rule_encoding[offset + element_size] == END_OF_ALTERNATE_MARKER
427
+ ), f"element should end with {END_OF_ALTERNATE_MARKER}, but got {rule_encoding[offset + element_size]}"
428
+ elements.append(rule_encoding[offset : offset + element_size + 1])
429
+ offset += element_size + 1
430
+ return elements
431
+
432
+
433
+ def _print_annotated_grammar(file, grammar_encoding, symbol_id_names, index=0):
434
+ rule_id = grammar_encoding[index]
435
+ print(f"<{index}>{symbol_id_names[rule_id]} ::=", end=" ", file=file)
436
+ pos = index + 1
437
+ while grammar_encoding[pos]:
438
+ if pos - 1 > index:
439
+ print("|", end=" ", file=file)
440
+ pos += 1 # sequence size, not needed here
441
+ while grammar_encoding[pos]:
442
+ if grammar_encoding[pos] == REF_RULE_MARKER:
443
+ ref_rule_id = grammar_encoding[pos + 1]
444
+ print(
445
+ f"<{pos}>{symbol_id_names[ref_rule_id]}",
446
+ end=" ",
447
+ file=file,
448
+ )
449
+ pos += 2
450
+ else:
451
+ print("<{}>[".format(pos), end="", file=file)
452
+ num_chars = grammar_encoding[pos]
453
+ pos += 1
454
+
455
+ for i in range(0, num_chars, 2):
456
+ print(
457
+ "{}-".format(chr(grammar_encoding[pos + i])), end="", file=file
458
+ )
459
+ if i + 1 < num_chars:
460
+ print(
461
+ "{}".format(chr(grammar_encoding[pos + i + 1])),
462
+ end="",
463
+ file=file,
464
+ )
465
+ print("]", end=" ", file=file)
466
+ pos += num_chars
467
+ pos += 1
468
+ print(file=file)
469
+ return pos + 1
470
+
471
+
472
+ def print_grammar(file, state):
473
+ pos = 0
474
+ symbol_id_names = {v: k for k, v in state.symbol_table.items()}
475
+ print("Grammar Rules:", file=file)
476
+ while (
477
+ pos < len(state.grammar_encoding)
478
+ and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
479
+ ):
480
+ pos = _print_annotated_grammar(
481
+ file, state.grammar_encoding, symbol_id_names, pos
482
+ )
483
+ if pos > len(state.grammar_encoding):
484
+ raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")
485
+ pos = 0
486
+ print("\nGrammar Hex representation:", file=file)
487
+ while (
488
+ pos < len(state.grammar_encoding)
489
+ and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
490
+ ):
491
+ print(f"{state.grammar_encoding[pos]:04x}", end=" ", file=file)
492
+ pos += 1
493
+ if pos > len(state.grammar_encoding):
494
+ raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")
495
+ else:
496
+ print("ffff\n")
497
+
498
+ print("Rules Decimal representation:", file=file)
499
+ # we loop until we reach the end of the grammar_encoding
500
+ rule_encodings = break_grammar_into_rules(state.grammar_encoding)
501
+ for rule_encoding in rule_encodings:
502
+ rule_id = rule_encoding[0]
503
+ print(
504
+ f"<{rule_id}> {break_rule_into_elements(rule_encoding)}",
505
+ file=file,
506
+ )
507
+
508
+
509
+ if __name__ == "__main__":
510
+ parser = argparse.ArgumentParser(description="Parse EBNF grammar files.")
511
+ parser.add_argument(
512
+ "-g",
513
+ "--grammar-file",
514
+ nargs="?",
515
+ default="examples/grammars/json.ebnf",
516
+ help="Path to the grammar file (default: examples/grammars/json.ebnf)",
517
+ )
518
+
519
+ args = parser.parse_args()
520
+
521
+ # set logging level
522
+ logging.basicConfig(level=logging.DEBUG)
523
+
524
+ with open(args.grammar_file, "r") as file:
525
+ input_text = file.read()
526
+ parsed_grammar = parse_ebnf(input_text)
527
+ parsed_grammar.print()
528
+ print(f"symbol_ids: \n{parsed_grammar.symbol_table}")
529
+
530
+ start_rule_id = parsed_grammar.symbol_table["root"]
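parser_cfg.py is a close variant of parser.py; one detail worth noting is how _parse_rhs_repetition_operators desugars '*', '+', and '?' into synthesized recursive rules. A minimal sketch (not part of this commit; the generated rule name and printed dict are illustrative) of what that looks like from the outside:

from transformers_gad.parser_cfg import parse_ebnf

state = parse_ebnf('root ::= "a"+\n')
# The '+' is rewritten into a synthesized rule (generated name, here "root_1"),
# roughly equivalent to writing:
#   root   ::= root_1
#   root_1 ::= "a" root_1 | "a"
print(state.symbol_table)  # e.g. {'root': 0, 'root_1': 1}
state.print()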
transformers_gad/recognizer.py ADDED
@@ -0,0 +1,456 @@
1
+ import copy
2
+ import logging
3
+ from functools import lru_cache
4
+ from typing import List, Tuple, Dict
5
+
6
+ from transformers_gad.parser import (
7
+ END_OF_RULE_MARKER,
8
+ END_OF_ALTERNATE_MARKER,
9
+ parse_ebnf,
10
+ REF_RULE_MARKER,
11
+ )
12
+ from transformers_gad.utf8_utils import PartialUTF8, decode_utf8
13
+ from transformers_gad.utils import intervals_intersect
14
+ import logging
15
+
16
+
17
+ class AcceptState:
18
+ def __init__(self, stacks, partial_utf8):
19
+ self.stacks = stacks
20
+ self.partial_utf8 = partial_utf8
21
+
22
+ @staticmethod
23
+ def empty_state():
24
+ return AcceptState([], PartialUTF8())
25
+
26
+
27
+ class StringRecognizer:
28
+ def __init__(
29
+ self,
30
+ grammar_encoding: List[int],
31
+ start_rule_id: int = None,
32
+ rule_offsets: List[int] = None,
33
+ stacks: List[List[int]] = None,
34
+ ):
35
+ # strictly speaking, we don't need to copy grammar_encoding because we don't modify it
36
+ # but we do it anyway to be safe
37
+ # in case where the grammar is very large, we can consider not copying it
38
+ self.grammar_encoding = grammar_encoding
39
+ if rule_offsets is not None:
40
+ self.rule_offsets = rule_offsets
41
+ else:
42
+ if start_rule_id is None:
43
+ raise ValueError("start_rule_id cannot be None if rule_offsets is None")
44
+ self.rule_offsets = self.init_rules(start_rule_id)
45
+ # each stack is a list of indices into grammar_encoding
46
+ # each index points to a rule's
47
+ if stacks is not None:
48
+ self.stacks = stacks
49
+ else:
50
+ if start_rule_id is None:
51
+ raise ValueError("start_rule_id cannot be None if stacks is None")
52
+ self.stacks: List[List[int]] = self.init_stack(start_rule_id)
53
+ self.start_rule_id = start_rule_id
54
+
55
+ def init_rules(self, start_rule_id: int) -> List[int]:
56
+ _rule_offset = 0
57
+ rule_offsets = []
58
+ # Build `rules` as an array of rule IDs to their positions in `grammar_src`
59
+ while self.grammar_encoding[_rule_offset] != 0xFFFF:
60
+ rule_id = self.grammar_encoding[_rule_offset]
61
+ # store the offset idx
62
+ if len(rule_offsets) <= rule_id:
63
+ rule_offsets.extend([-1] * (rule_id - len(rule_offsets) + 1))
64
+ rule_offsets[rule_id] = _rule_offset
65
+
66
+ # Skip rule ID
67
+ # _rule_offset += 1
68
+ simple_rhs_offset = _rule_offset + 1
69
+
70
+ # Skip rule alternates
71
+ while self.grammar_encoding[simple_rhs_offset] != END_OF_RULE_MARKER:
72
+ simple_rhs_offset = (
73
+ simple_rhs_offset + 1 + self.grammar_encoding[simple_rhs_offset]
74
+ )
75
+
76
+ # Skip 0 denoting end of rule
77
+ # _rule_offset += 1
78
+ _rule_offset = simple_rhs_offset + 1
79
+
80
+ retrieved_start_rule_id = self.grammar_encoding[rule_offsets[start_rule_id]]
81
+ assert retrieved_start_rule_id == start_rule_id
82
+
83
+ return rule_offsets
84
+
85
+ def init_stack(self, start_rule_id: int) -> List[List[int]]:
86
+
87
+ stacks = []
88
+ # Loop over alternates of start rule to build initial stacks
89
+ sub_rhs_offset = self.rule_offsets[start_rule_id] + 1
90
+ while self.grammar_encoding[sub_rhs_offset]:
91
+ stack: List[int] = []
92
+ # If alternate is nonempty, add to stack
93
+ element_offset = sub_rhs_offset + 1
94
+ if self.grammar_encoding[element_offset] != END_OF_ALTERNATE_MARKER:
95
+ stack.append(element_offset)
96
+ stacks.extend(self.advance_stack(tuple(stack)))
97
+ sub_rhs_offset += 1 + self.grammar_encoding[sub_rhs_offset]
98
+ return stacks
99
+
100
+ def get_initial_accept_state(self) -> AcceptState:
101
+ return AcceptState(self.init_stack(self.start_rule_id), PartialUTF8())
102
+
103
+ def get_termination_accept_state(self) -> AcceptState:
104
+ return AcceptState([], PartialUTF8())
105
+
106
+ @lru_cache(maxsize=32768)
107
+ def advance_stack(self, stack: Tuple[int]) -> List[List[int]]:
108
+ stack = list(stack)
109
+ if len(stack) == 0:
110
+ return [stack]
111
+
112
+ # we get the last element of the stack, which is the element we are currently processing
113
+ cur_element_offset = stack[-1]
114
+
115
+ # if the element is a terminal, we don't need to advance the stack
116
+ if self.grammar_encoding[cur_element_offset] != REF_RULE_MARKER:
117
+ return [stack]
118
+ # the remaining case is that the element is a non-terminal, i.e. a reference to another rule
119
+ else:
120
+ ref_rule_id = self.grammar_encoding[cur_element_offset + 1]
121
+ # find the offset of the referenced rule
122
+ ref_subrule_offset = self.rule_offsets[ref_rule_id] + 1
123
+ new_stacks: List[List[int]] = []
124
+ # Loop over alternates of referenced rule to build new stacks
125
+ while self.grammar_encoding[ref_subrule_offset] != END_OF_RULE_MARKER:
126
+ # copy the original stack without the last element
127
+ new_stack = stack[:-1]
128
+ # if the rule ref is followed by another element, we add it to the stack
129
+ next_element_offset = cur_element_offset + 2
130
+ if (
131
+ self.grammar_encoding[next_element_offset]
132
+ != END_OF_ALTERNATE_MARKER
133
+ ):
134
+ new_stack.append(next_element_offset)
135
+
136
+ # if the referenced rule is not empty, we add its element offset to the stack
137
+ ref_element_offset = ref_subrule_offset + 1
138
+ if self.grammar_encoding[ref_element_offset] != END_OF_ALTERNATE_MARKER:
139
+ new_stack.append(ref_element_offset)
140
+
141
+ new_stacks.extend(self.advance_stack(tuple(new_stack)))
142
+ ref_subrule_offset += self.grammar_encoding[ref_subrule_offset] + 1
143
+
144
+ return new_stacks
145
+
146
+ def _consume_byte(self, byte: int, accept_state: AcceptState):
147
+ # suppose we have code point 一, ord('一') = 19968, we need to match 3 bytes
148
+ # we need to match 3 bytes, so we need to call _consume_byte_partial_match 3 times
149
+ self._consume_bytes(bytes([byte]), accept_state)
150
+
151
+ # @lru_cache(maxsize=32768)
152
+ def _probe_bytes(
153
+ self,
154
+ byte_seq: bytes,
155
+ stacks: List[List[int]],
156
+ partial_utf8: PartialUTF8,
157
+ verbose=True,
158
+ ):
159
+ if type(byte_seq) is list:
160
+ byte_seq = bytes(byte_seq)
161
+ code_points, new_partial_utf8 = decode_utf8(byte_seq, partial_utf8)
162
+ if verbose:
163
+ logging.debug(
164
+ f"code_points: {code_points}; new_partial_utf8: {new_partial_utf8}"
165
+ )
166
+ new_stacks = self._consume_code_points(code_points, stacks)
167
+
168
+ for stack in new_stacks:
169
+
170
+ # stack is empty, meaning that the variables are all consumed
171
+ if len(stack) == 0:
172
+ return True
173
+ element_offset = stack[-1]
174
+ if self.partial_utf8_accept_at_element(element_offset, new_partial_utf8):
175
+ return True
176
+ return False
177
+
178
+ def _consume_bytes(
179
+ self,
180
+ byte_seq: bytes,
181
+ accept_state: AcceptState = None,
182
+ verbose=True,
183
+ ):
184
+ if accept_state is None:
185
+ accept_state = self.get_initial_accept_state()
186
+ stacks = accept_state.stacks
187
+ partial_utf8 = accept_state.partial_utf8
188
+ if type(byte_seq) is list:
189
+ byte_seq = bytes(byte_seq)
190
+ code_points, new_partial_utf8 = decode_utf8(byte_seq, partial_utf8)
191
+ if verbose:
192
+ logging.debug(
193
+ f"code_points: {code_points}; new_partial_utf8: {new_partial_utf8}"
194
+ )
195
+ new_stacks = self._consume_code_points(code_points, stacks)
196
+
197
+ new_new_stacks = []
198
+ for stack in new_stacks:
199
+ if len(stack) == 0:
200
+ continue
201
+ element_offset = stack[-1]
202
+ if self.partial_utf8_accept_at_element(element_offset, new_partial_utf8):
203
+ new_new_stacks.append(stack)
204
+ return AcceptState(new_new_stacks, new_partial_utf8)
205
+
206
+ ##########################
207
+ #
208
+ # Code point recognition
209
+ #
210
+ ##########################
211
+
212
+ @lru_cache(maxsize=30000)
213
+ def _consume_code_point(
214
+ self, code_point: int, stacks: Tuple[Tuple[int]]
215
+ ) -> List[List[int]]:
216
+ """
217
+ consume a character from the stack
218
+ char_code_point: can be a Unicode code point, including ascii code points which are in the range [0, 127]
219
+ """
220
+ new_stacks = []
221
+
222
+ stacks: List[List[int]] = list([list(stack) for stack in stacks])
223
+ if code_point == 0:
224
+ return new_stacks
225
+ for stack in stacks:
226
+ new_stacks.extend(
227
+ self._consume_code_point_per_stack(code_point, tuple(stack))
228
+ )
229
+ return new_stacks
230
+
231
+ @lru_cache(maxsize=30000)
232
+ def _consume_code_point_per_stack(
233
+ self, code_point: int, stack: Tuple[int]
234
+ ) -> List[List[int]]:
235
+ """
236
+ consume a character from the stack
237
+ char_code_point: can be a Unicode code point, including ascii code points which are in the range [0, 127]
238
+ """
239
+ # TODO, the below code will raise an error when the stack is empty, but why is this happening?
240
+ # if len(stacks) == 0:
241
+ # raise ValueError("Stacks don't contain any stack, meaning that no character can be consumed")
242
+ # code_point = 0 is a special case when the uf8 sequence is not complete, we return an empty stack
243
+ # to indicate that the character is not accepted
244
+ stack = list(stack)
245
+ new_stacks = []
246
+ if code_point == 0:
247
+ return new_stacks
248
+ # stack is empty
249
+ if len(stack) == 0:
250
+ return new_stacks
251
+
252
+ element_offset = stack[-1]
253
+
254
+ found = self.accept_code_point_at_element(code_point, element_offset)
255
+ if not found:
256
+ return new_stacks
257
+
258
+ size = self.grammar_encoding[element_offset]
259
+ element_offset += size + 1
260
+ new_stack = stack[:-1]
261
+ if self.grammar_encoding[element_offset]:
262
+ new_stack.append(element_offset)
263
+ return self.advance_stack(tuple(new_stack))
264
+
265
+ def _consume_code_points(
266
+ self, code_points: List[int], stacks: List[List[int]], verbose=False
267
+ ) -> List[List[int]]:
268
+ for i, code_point in enumerate(code_points):
269
+ # for lru_cache to work, we need to convert the list of stacks into a tuple of stacks
270
+ tuple_stacks: Tuple[Tuple[int]] = tuple([tuple(stack) for stack in stacks])
271
+ stacks = self._consume_code_point(code_point, tuple_stacks)
272
+ if len(stacks) > 0 and verbose:
273
+ accepted_code_point = code_points[: i + 1]
274
+ corresponding_char = chr(code_point)
275
+ logging.debug(
276
+ f"code point {accepted_code_point} corresponding to {corresponding_char} is accepted"
277
+ )
278
+ return stacks
279
+
280
+ def _accept_code_points(
281
+ self, code_points: List[int], stacks: List[List[int]], verbose=False
282
+ ) -> bool:
283
+ stacks = self._consume_code_points(code_points, stacks, verbose)
284
+ return len(stacks) > 0
285
+
286
+ @lru_cache(maxsize=30000)
287
+ def accept_code_point_at_element(
288
+ self, code_point: int, element_offset: int
289
+ ) -> bool:
290
+ size = self.grammar_encoding[element_offset]
291
+ # to make idx point to the range_start of the first range
292
+ element_offset += 1
293
+ for i in range(0, size, 2):
294
+ if (
295
+ self.grammar_encoding[element_offset + i]
296
+ <= code_point
297
+ <= self.grammar_encoding[element_offset + i + 1]
298
+ ):
299
+ return True
300
+ return False
301
+
302
+ # def _accept_code_point(self, code_point: int, stacks: List[List[int]]):
303
+ # # for lru_cache to work, we need to convert the list of stacks into a tuple of stacks
304
+ # tuple_stacks: Tuple[Tuple[int]] = tuple([tuple(stack) for stack in stacks])
305
+ # new_stacks: List[List[int]] = self._consume_code_point(code_point, tuple_stacks)
306
+ # return len(new_stacks) > 0
307
+
308
+ #############################
309
+ #
310
+ # Partial UTF-8 recognition
311
+ #
312
+ #############################
313
+
314
+ def partial_utf8_accept_at_element(
315
+ self, element_offset: int, partial_utf8: PartialUTF8
316
+ ) -> bool:
317
+ # Extract the accumulated value and the number of remaining bytes from the partial_utf8 object.
318
+ partial_value = partial_utf8.value
319
+ n_remain = partial_utf8.n_remain
320
+
321
+ # Return False if there are no remaining bytes to process or if it's an invalid UTF-8 sequence.
322
+ if n_remain == 1 and partial_value < 2:
323
+ return False
324
+
325
+ # If there are no remaining bytes, this means we had already consumed a complete UTF-8 sequence.
326
+ if n_remain <= 0:
327
+ return True
328
+
329
+ # Calculate the lowest possible Unicode code point that can be formed with the remaining bytes.
330
+ low = partial_value << (n_remain * 6)
331
+ # Calculate the highest possible Unicode code point by setting all remaining bits to 1.
332
+ high = low | ((1 << (n_remain * 6)) - 1)
333
+
334
+ # If the low end of the range is 0 and a specific number of bytes remain, adjust low to the minimum value
335
+ # that can be represented with that number of bytes. This accounts for UTF-8 encoding rules.
336
+ if low == 0:
337
+ if n_remain == 2:
338
+ low = 1 << 11 # Minimum value representable with 2 additional bytes.
339
+ elif n_remain == 3:
340
+ low = 1 << 16 # Minimum value representable with 3 additional bytes.
341
+
342
+ # Get the size of the grammar rule starting at the current element_offset.
343
+ size = self.grammar_encoding[element_offset]
344
+ # Move the element_offset to the start of the grammar rule's definition.
345
+ element_offset += 1
346
+
347
+ # Iterate over the grammar rule, checking if the range defined by low-high overlaps with any specified ranges.
348
+ for i in range(0, size, 2):
349
+ # If the current range (specified in the grammar encoding) overlaps with the low-high range, return True.
350
+ if intervals_intersect(
351
+ low,
352
+ high,
353
+ self.grammar_encoding[element_offset + i],
354
+ self.grammar_encoding[element_offset + i + 1],
355
+ ):
356
+ return True
357
+
358
+ # If no overlap is found with any of the ranges, return False, indicating no valid partial match.
359
+ return False
360
+
361
+ #############################
362
+ #
363
+ # String recognition
364
+ #
365
+ #############################
366
+
367
+ def _consume_string(self, string: str, accept_state: AcceptState):
368
+ # _bytes = bytes(string, "utf-8")
369
+ code_points = [ord(char) for char in string]
370
+ stacks = self._consume_code_points(code_points, accept_state.stacks)
371
+ return AcceptState(stacks, accept_state.partial_utf8)
372
+
373
+ def _accept_prefix(self, string: str, accept_state: AcceptState = None):
374
+ if accept_state is None:
375
+ accept_state = self.get_initial_accept_state()
376
+ new_accept_state = self._consume_string(string, accept_state)
377
+ return len(new_accept_state.stacks) > 0
378
+
379
+ def _accept_string(self, string: str, accept_state: AcceptState = None):
380
+ if accept_state is None:
381
+ accept_state = self.get_initial_accept_state()
382
+ new_accept_state = self._consume_string(string, accept_state)
383
+ at_least_one_stack_is_empty = any(
384
+ len(stack) == 0 for stack in new_accept_state.stacks
385
+ )
386
+ return at_least_one_stack_is_empty
387
+
388
+ def _can_stop(self, stacks: List[List[int]]):
389
+ # This happens in practice, but maybe it shouldn't? TODO
390
+ if len(stacks) == 0:
391
+ return True
392
+ # if any of the stack is empty, we can stop
393
+ for stack in stacks:
394
+ if len(stack) == 0:
395
+ return True
396
+ else:
397
+ return False
398
+
399
+ def _must_stop(self, stacks: List[List[int]]):
400
+ return len(stacks) == 0 or all(len(stack) == 0 for stack in stacks)
401
+
402
+ #############################
403
+ #
404
+ # Not Used
405
+ #
406
+ #############################
407
+
408
+ # For each sub-rule in the grammar, cache whether each byte is accepted.
409
+ @lru_cache(maxsize=None)
410
+ def char_acceptance_at_element(self, element_offset):
411
+ """
412
+ Caches and returns a dictionary indicating whether a Unicode character is accepted
413
+ at a given rule position. This function considers Unicode characters, dynamically
414
+ inserting accepted ranges into a dictionary to optimize memory usage.
415
+
416
+ Args:
417
+ - rule_offset: The offset in the grammar encoding where the rule starts.
418
+
419
+ Returns:
420
+ - A dictionary where each key is a Unicode character (or range) and the value is True if accepted.
421
+ """
422
+ logging.debug(f"element_offset: {element_offset}")
423
+ acceptance = {}
424
+ num_chars = self.grammar_encoding[element_offset]
425
+ element_offset += 1
426
+ for i in range(0, num_chars, 2):
427
+ start = self.grammar_encoding[element_offset + i]
428
+ end = self.grammar_encoding[element_offset + i + 1]
429
+ for j in range(start, end + 1):
430
+ acceptance[j] = True
431
+ logging.debug(acceptance)
432
+ return acceptance
433
+
434
+ def _consume_code_points_new(
435
+ self, code_points: List[int], stacks: List[List[int]], verbose=False
436
+ ) -> List[List[int]]:
437
+ new_stacks: List[List[int]] = []
438
+ for stack in stacks:
439
+ new_stacks.extend(
440
+ self._consume_code_points_per_stack(
441
+ tuple(code_points), tuple(stack), verbose
442
+ )
443
+ )
444
+ return new_stacks
445
+
446
+ @lru_cache(maxsize=30000)
447
+ def _consume_code_points_per_stack(
448
+ self, code_points: Tuple[int], stack: Tuple[int], verbose=False
449
+ ) -> List[List[int]]:
450
+ code_points = list(code_points)
451
+ stacks = (stack,)
452
+ for i, code_point in enumerate(code_points):
453
+ # for lru_cache to work, we need to convert the list of stacks into a tuple of stacks
454
+ stacks = self._consume_code_point(code_point, stacks)
455
+ stacks = tuple([tuple(stack) for stack in stacks])
456
+ return [list(stack) for stack in stacks]
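A minimal sketch (not part of this commit) of how StringRecognizer is typically driven for plain ASCII input: build it from a parsed grammar, then use _accept_prefix / _accept_string to check partial and complete matches. The grammar string and test inputs are illustrative.

from transformers_gad.parser import parse_ebnf
from transformers_gad.recognizer import StringRecognizer

parsed = parse_ebnf('root ::= "ab" [0-9]\n')
recognizer = StringRecognizer(parsed.grammar_encoding, parsed.symbol_table["root"])

print(recognizer._accept_prefix("ab"))   # True: "ab" can still be extended
print(recognizer._accept_string("ab"))   # False: the trailing digit is missing
print(recognizer._accept_string("ab7"))  # True: a complete sentence of the grammar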
transformers_gad/token_grammar_recognizer.py ADDED
@@ -0,0 +1,322 @@
1
+ import copy
2
+ import logging
3
+ from abc import ABC
4
+ from functools import lru_cache
5
+ from typing import List
6
+
7
+ import torch
8
+
9
+ from transformers_gad.recognizer import StringRecognizer, AcceptState
10
+ from transformers_gad.parser import parse_ebnf
11
+ from transformers_gad.trie import ByteTrie
12
+ from transformers_gad.utf8_utils import PartialUTF8
13
+ from .vocab_struct import LEAF, TokenTrie
14
+ from transformers_gad.mapping import get_mapping
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class AbsTokenRecognizer(ABC):
20
+ def __init__(self, grammar_str, tokenizer, start_rule_name="root", unicode=False):
21
+ parsed_grammar = parse_ebnf(grammar_str)
22
+ grammar_encoding = parsed_grammar.grammar_encoding
23
+ self.start_rule_id = parsed_grammar.symbol_table.get(start_rule_name)
24
+ self.byte_encoding = unicode
25
+
26
+ if unicode and not tokenizer.__class__.__name__.lower().startswith(
27
+ "gpt2"
28
+ ): # gpt2tokenizer or gpt2tokenizerfast
29
+ raise ValueError(
30
+ "Constrained decoding with unicode is only supported for GPT2 model. Support for other models is coming soon."
31
+ "Or you can use the constraints with only ascii characters."
32
+ )
33
+
34
+ self.eos_token_id = tokenizer.eos_token_id
35
+ self.token_trie = TokenTrie(tokenizer)
36
+ self.tokenizer = tokenizer
37
+ self.string_recognizer = StringRecognizer(grammar_encoding, self.start_rule_id)
38
+ self.unicode_trie = ByteTrie.from_tokenizer(tokenizer, unicode=unicode)
39
+ self.mapping = get_mapping(tokenizer, unicode=unicode)
40
+ assert len(self.mapping) == len(
41
+ self.token_trie
42
+ ), f"{len(self.mapping)}, {len(self.token_trie)}"
43
+
44
+ def _consume_token_id(
45
+ self, token_id: int, accept_state: AcceptState
46
+ ) -> AcceptState:
47
+ if self.string_recognizer._must_stop(accept_state.stacks):
48
+ if token_id == self.eos_token_id:
49
+ return self.string_recognizer.get_termination_accept_state()
50
+ else:
51
+ raise ValueError(
52
+ f"All stacks are empty, so the only token accepted is EOS({self.eos_token_id}), but got {token_id}"
53
+ )
54
+ if token_id == self.eos_token_id:
55
+ if self.string_recognizer._can_stop(accept_state.stacks):
56
+ # if at least one of the stack is empty, we can stop
57
+ # we clear all the stacks, meaning that we don't accept any token after EOS
58
+ return self.string_recognizer.get_termination_accept_state()
59
+ else:
60
+ raise ValueError(
61
+ f"At least one of the stack should be empty when EOS is reached. However, "
62
+ f"the stacks are {accept_state.stacks}"
63
+ )
64
+
65
+ bytes_or_codepoints = self.mapping.map(token_id)
66
+ accept_state = self.string_recognizer._consume_bytes(
67
+ bytes_or_codepoints, accept_state
68
+ )
69
+ return accept_state
70
+
71
+ def probe_token_id(self, token_id: int, accept_state: AcceptState) -> bool:
72
+ stacks = accept_state.stacks
73
+ if self.string_recognizer._must_stop(stacks):
74
+ if token_id == self.eos_token_id:
75
+ return True
76
+ else:
77
+ return False
78
+ if token_id == self.eos_token_id:
79
+ if self.string_recognizer._can_stop(stacks):
80
+ # if at least one of the stack is empty, we can stop
81
+ # we clear all the stacks, meaning that we don't accept any token after EOS
82
+ return True
83
+ else:
84
+ return False
85
+ # for code_point in self.mapping.map(token_id):
86
+ # stacks = self.grammar._consume_char_code_point(code_point, stacks)
87
+ bytes_or_codepoints = self.mapping.map(token_id, verbose=False)
88
+ new_acc_state = self.string_recognizer._consume_bytes(
89
+ bytes_or_codepoints, accept_state, verbose=False
90
+ )
91
+ return len(new_acc_state.stacks) > 0
92
+
93
+ def advance_token_ids(self, *args, **kwargs):
94
+ """Process a list of tokens according to the grammar rules."""
95
+ raise NotImplementedError
96
+
97
+ def batch_filter_vocab(self, batch_accept_states, device) -> torch.Tensor:
98
+ batch_acceptance = []
99
+ for accept_state in batch_accept_states:
100
+ batch_acceptance.append(self.filter_vocab(accept_state, device))
101
+ return torch.stack(batch_acceptance)
102
+
103
+ def filter_vocab(self, accept_state, device) -> torch.Tensor:
104
+ if not accept_state.stacks: # Check if stacks is empty
105
+ # Handle the empty case: for example, return a tensor of False
106
+ # The size of the tensor should match the size of your vocabulary
107
+ vocab_size = len(self.mapping)
108
+ logger.debug(f"Empty stack, sum of acceptance: {0}")
109
+ # size of the vocab
110
+ accepts = [False] * vocab_size
111
+ accepts[self.eos_token_id] = True
112
+ return torch.tensor(accepts, dtype=torch.bool, device=device)
113
+
114
+ acceptance = self.get_token_acceptance(accept_state, device)
115
+
116
+ return acceptance
117
+
118
+ def get_token_acceptance(self, accept_state, device) -> torch.Tensor:
119
+ acceptance_matrix = torch.cat(
120
+ [
121
+ self.get_token_acceptance_array_for_stack(
122
+ tuple(stack), accept_state.partial_utf8, device
123
+ )
124
+ for stack in accept_state.stacks
125
+ ]
126
+ )
127
+ # Merge stacks: any True => True
128
+ acceptance = acceptance_matrix.reshape(len(accept_state.stacks), -1).any(dim=0)
129
+ return acceptance
130
+
131
+ @lru_cache(maxsize=32768)
132
+ def get_token_acceptance_array_for_stack(self, stack, partial_utf8, device):
133
+ # stack = list(stack) # needs to come in as a tuple for lru_cache
134
+ assert isinstance(stack, tuple)
135
+ stack = list(stack)
136
+
137
+ if self.byte_encoding:
138
+
139
+ accept_f = lambda x: self.string_recognizer._probe_bytes(
140
+ x, [stack], partial_utf8=partial_utf8
141
+ )
142
+ token_acceptance = self.unicode_trie.get_token_acceptance(
143
+ accept=accept_f, accept_eos=False, eos_token_id=self.eos_token_id
144
+ )
145
+ else:
146
+ accepts = [False] * len(self.mapping)
147
+ token_acceptance = check_token_acceptance_in_trie(
148
+ self.token_trie.trie,
149
+ [stack],
150
+ self.string_recognizer,
151
+ self.eos_token_id,
152
+ accepts,
153
+ )
154
+ x = torch.tensor(token_acceptance, dtype=torch.bool, device=device)
155
+ x_eos = self.validate_and_set_eos_acceptance(x)
156
+ return x_eos
157
+
158
+ def validate_and_set_eos_acceptance(self, acceptance: torch.Tensor) -> torch.Tensor:
159
+ if torch.any(acceptance) == 0:
160
+ acceptance[self.eos_token_id] = True
161
+ else:
162
+ if acceptance[self.eos_token_id]:
163
+ raise ValueError()
164
+ acceptance[self.eos_token_id] = False
165
+ return acceptance
166
+
167
+
168
+ class IncrementalTokenRecognizer(AbsTokenRecognizer):
169
+ def __init__(self, grammar_str, start_rule_name, tokenizer, unicode=False):
170
+ super().__init__(grammar_str, tokenizer, start_rule_name, unicode)
171
+ self.last_size = None
172
+ self.is_incremental = True
173
+
174
+ # if self.last_size is not set (which would be the case when processing the first token).
175
+ # In this case, do nothing.
176
+
177
+ def advance_token_ids(self, input_ids, batch_accept_states, parse_start_index=None):
178
+
179
+ if self.last_size is None:
180
+ prefix_to_parse = [
181
+ single_input_ids[parse_start_index:]
182
+ if parse_start_index is not None
183
+ else []
184
+ for single_input_ids in input_ids
185
+ ]
186
+
187
+ # self.grammar_acceptor.accept_token_ids(prefix_to_parse, self.stacks)
188
+ batch_accept_states = [
189
+ self._consume_token_ids(prefix, accept_state)
190
+ for prefix, accept_state in zip(prefix_to_parse, batch_accept_states)
191
+ ]
192
+ # if the length of the current input IDs (input_ids[0]) is exactly one more than self.last_size.
193
+ # This is expected in a scenario where inputs are processed incrementally, one token at a time.
194
+ elif len(input_ids[0]) == self.last_size + 1:
195
+ batch_accept_states = [
196
+ self._consume_token_id(single_input_ids[-1], accept_state)
197
+ for single_input_ids, accept_state in zip(
198
+ input_ids, batch_accept_states
199
+ )
200
+ ]
201
+ # ensure that the input size is consistent with the expected incremental processing
202
+ # (i.e., one token at a time).
203
+ else:
204
+ # here we check if the input_ids are one token longer than the last time we processed
205
+ # but we don't check if input_ids are actually valid.
206
+ # Imagine a scenario where we generate 10 tokens, then we replace the 10 generated tokens with 10 new tokens.
207
+ # In this case, the input_ids will be consistent with the last_size, but the input_ids are not valid.
208
+ # However, should we really check if the input_ids are valid here?
209
+ # If we do, then we need to reparse the whole input_ids at each call, which is not efficient.
210
+ # Maybe we should just trust the user to provide valid input_ids?
211
+ # The conclusion is that, we assume the input_ids are valid, and our generation will be correct.
212
+ # If the input_ids are not valid, then the generation result will be wrong and we don't take responsibility for that.
213
+ raise RuntimeError(
214
+ "Input ID's length is inconsistent with the current state of "
215
+ "the GrammarConstrainedLogitsProcessor. If you want to process "
216
+ "another input sequence, please instantiate a new "
217
+ "GrammarConstrainedLogitsProcessor "
218
+ "or call reset_parser method of GrammarAlignedOracleLogitsProcessor"
219
+ )
220
+ self.last_size = len(input_ids[0])
221
+
222
+ return batch_accept_states
223
+
224
+ def _consume_token_ids(
225
+ self, token_ids: List[int], accept_state: AcceptState = None, as_string=True
226
+ ):
227
+ if accept_state is None:
228
+ accept_state = self.string_recognizer.get_initial_accept_state()
229
+ if as_string:
230
+ string = self.tokenizer.decode(token_ids)
231
+ accept_state = self.string_recognizer._consume_string(string, accept_state)
232
+ else:
233
+ for i, token_id in enumerate(token_ids):
234
+ accept_state = self._consume_token_id(token_id, accept_state)
235
+ if len(accept_state.stacks) > 0:
236
+ cur_token_ids = token_ids[: i + 1]
237
+ logging.debug(f"{cur_token_ids} is accepted")
238
+ decoded_string = self.tokenizer.decode(cur_token_ids)
239
+ logging.debug(f"The decoded string is {decoded_string}")
240
+ return accept_state
241
+
242
+ def reset(self):
243
+ self.last_size = None
244
+
245
+ def check_token_acceptance_in_trie(trie, stacks, grammar, eos_token_id, accepts):
246
+
247
+ for byte, next_trie in trie.items():
248
+ if byte == LEAF:
249
+ token_id = next_trie
250
+ if token_id != eos_token_id:
251
+ # if the stacks is not empty, it means we can still continue to parse
252
+ # so we should accept the token
253
+ accepts[token_id] = bool(stacks)
254
+ continue
255
+
256
+ new_stacks = []
257
+ for stk in stacks:
258
+ if not stk:
259
+ continue
260
+
261
+ next_element_offset = stk[-1]
262
+ num_chars = grammar.grammar_encoding[next_element_offset]
263
+
264
+ if not grammar.char_acceptance_at_element(next_element_offset).get(
265
+ byte, False
266
+ ):
267
+ # if the current byte is not accepted by the current rule, we need to try next rule
268
+ continue
269
+
270
+ next_element_offset += num_chars + 1
271
+ new_stack = stk[:-1]
272
+ if grammar.grammar_encoding[next_element_offset]:
273
+ new_stack.append(next_element_offset)
274
+ new_stacks.extend(grammar.advance_stack(tuple(new_stack)))
275
+
276
+ if new_stacks:
277
+ check_token_acceptance_in_trie(
278
+ next_trie, new_stacks, grammar, eos_token_id, accepts
279
+ )
280
+
281
+ return accepts
282
+
283
+
284
+ if __name__ == "__main__":
285
+ from transformers import AutoTokenizer
286
+
287
+ with open("examples/grammars/japanese.ebnf", "r") as file:
288
+ input_text = file.read()
289
+ parsed_grammar = parse_ebnf(input_text)
290
+ parsed_grammar.print()
291
+
292
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
293
+
294
+ tokenRecognizer = IncrementalTokenRecognizer(
295
+ grammar_str=input_text, start_rule_name="root", tokenizer=tokenizer
296
+ )
297
+
298
+ japanese = "トリーム" # "こんにちは"
299
+ token_ids = tokenizer.encode(japanese)
300
+ # 13298, 12675, 12045, 254
301
+ stacks = tokenRecognizer._consume_token_ids(
302
+ token_ids, tokenRecognizer.string_recognizer.stacks, as_string=False
303
+ )
304
+
305
+ if stacks:
306
+ print("The Japanese input is accepted")
307
+ else:
308
+ print("The Japanese input is not accepted")
309
+
310
+ korean = "안녕하세요"
311
+ token_ids = tokenizer.encode(korean)
312
+
313
+ try:
314
+ stacks = tokenRecognizer._consume_token_ids(
315
+ token_ids, tokenRecognizer.string_recognizer.stacks, as_string=False
316
+ )
317
+ if stacks:
318
+ print("The Korean input is accepted")
319
+ else:
320
+ print("The Korean input is not accepted")
321
+ except ValueError as e:
322
+ print("The Korean input is not accepted")
transformers_gad/trie.py ADDED
@@ -0,0 +1,194 @@
1
+ import logging
2
+ from functools import lru_cache
3
+ from typing import Dict, List, Tuple
4
+ from collections import deque
5
+
6
+ from transformers_gad.mapping import get_mapping
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class TrieNode:
11
+ def __init__(self):
12
+ self.children = {}
13
+ self.is_end_of_word = False
14
+ self.token_id = None
15
+
16
+
17
+ class ByteTrie:
18
+ def __init__(self):
19
+ self.root = TrieNode()
20
+
21
+ def insert(self, word, token_id=None):
22
+ node = self.root
23
+ for char in word:
24
+ if char not in node.children:
25
+ node.children[char] = TrieNode()
26
+ node = node.children[char]
27
+ node.is_end_of_word = True
28
+ node.token_id = token_id
29
+
30
+ def search(self, word):
31
+ node = self.root
32
+ for char in word:
33
+ if char not in node.children:
34
+ return False
35
+ node = node.children[char]
36
+ return node.is_end_of_word
37
+
38
+ def start_with_prefix(self, prefix):
39
+ node = self.root
40
+ for char in prefix:
41
+ if char not in node.children:
42
+ return False
43
+ node = node.children[char]
44
+ return True
45
+
46
+ @classmethod
47
+ def from_tokenizer(cls, tokenizer, unicode=True):
48
+ vocab: Dict[str, int] = tokenizer.get_vocab()
49
+ trie = cls()
50
+ mapping = get_mapping(tokenizer, unicode=unicode)
51
+ for token_id in vocab.values():
52
+ byte_repr = mapping.map(token_id)
53
+ trie.insert(byte_repr, token_id)
54
+ return trie
55
+
56
+ @lru_cache(maxsize=128)
57
+ def __len__(self):
58
+ return len(self.dfs(verbose=False))
59
+
60
+ def dfs(self, accept=lambda x: True, verbose=False) -> List[Tuple[List[int], int]]:
61
+ result = []
62
+ counter = {"visited": 0, "pruned": 0}
63
+ _dfs(self.root, [], result, accept, counter)
64
+ return result
65
+
66
+ def bfs(
67
+ self, predicate=lambda x: True, verbose=False
68
+ ) -> List[Tuple[List[int], int]]:
69
+ queue = deque([(self.root, [])])
70
+ valid_byte_seqs: List[Tuple[List[int], int]] = []
71
+ counter = {"visited": 0, "pruned": 0}
72
+
73
+ while queue:
74
+ counter["visited"] += 1
75
+ node, byte_seq = queue.popleft()
76
+ if predicate(byte_seq):
77
+ if node.is_end_of_word:
78
+ valid_byte_seqs.append((byte_seq, node.token_id))
79
+ for char, next_node in node.children.items():
80
+ new_byte_seq: List[int] = byte_seq.copy()
81
+ new_byte_seq.append(char)
82
+ queue.append((next_node, new_byte_seq))
83
+ else:
84
+ counter["pruned"] += 1
85
+ return valid_byte_seqs
86
+
87
+ def get_token_acceptance(
88
+ self, accept=lambda x: True, accept_eos=True, eos_token_id=None
89
+ ) -> List[bool]:
90
+ valid_byte_seqs: List[Tuple[List[int], int]] = self.bfs(accept, verbose=True)
91
+ valid_token_ids: List[int] = [token_id for _, token_id in valid_byte_seqs]
92
+ token_acceptance: List[bool] = [False] * (len(self))
93
+ for token_id in valid_token_ids:
94
+ token_acceptance[token_id] = True
95
+ if not accept_eos:
96
+ # eos_token is mapped to an empty string, so it's always accepted regardless of the accept function
97
+ # this can be undesirable, so we can set it to False to ignore it
98
+ token_acceptance[eos_token_id] = False
99
+ return token_acceptance
100
+
101
+
102
+ def _dfs(
103
+ node,
104
+ cur_byte_seq: List[int],
105
+ result: List[Tuple[List[int], int]],
106
+ accept: callable,
107
+ counter: Dict[str, int],
108
+ ):
109
+ counter["visited"] += 1
110
+ if accept(cur_byte_seq):
111
+ if node.is_end_of_word:
112
+ result.append((cur_byte_seq, node.token_id))
113
+ for char, next_node in node.children.items():
114
+ new_byte_seq: List[int] = cur_byte_seq.copy()
115
+ new_byte_seq.append(char)
116
+ _dfs(next_node, new_byte_seq, result, accept, counter)
117
+ else:
118
+ # Skip the entire subtree if the predict function returns False
119
+ counter["pruned"] += 1
120
+ return
121
+
122
+
123
+ def starts_with_prefix(prefix, target):
124
+ """
125
+ Check if the given prefix is a valid start of the target word or if the target word is a valid start of the given prefix.
126
+
127
+ Args:
128
+ prefix (str): The string prefix to be checked.
129
+ target (str): The target word to compare the prefix against.
130
+
131
+ Returns:
132
+ bool: True if prefix is a valid start of target or if target is a valid start of prefix, False otherwise.
133
+ """
134
+
135
+ # Check if the target word starts with the given prefix.
136
+ # This covers the case where the prefix is shorter than the target word.
137
+ if target.startswith(prefix):
138
+ return True
139
+
140
+ # Check if the given prefix starts with the target word.
141
+ # This covers the case where the prefix is longer than or equal to the target word.
142
+ if prefix.startswith(target):
143
+ return True
144
+
145
+ # If neither of the above conditions are true, return False.
146
+ return False
147
+
148
+
149
+ if __name__ == "__main__":
150
+ import logging
151
+
152
+ # Configure logging
153
+ logging.basicConfig(level=logging.INFO)
154
+ from transformers import AutoTokenizer
155
+
156
+ tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
157
+
158
+ trie = ByteTrie.from_tokenizer(tokenizer, unicode=True)
159
+ print(f"length of trie: {len(trie)}=={len(tokenizer.vocab.items())}")
160
+
161
+ #
162
+ # print(trie.search("hello")) # Example, replace with actual words from the vocab
163
+ # print(trie.start_with_prefix("hell"))
164
+ #
165
+ # # Example Usage
166
+ # words = trie.dfs(accept=lambda x: len(x) > 0 and x[0] == 65 or len(x)==0)
167
+ # for word in words:
168
+ # print(bytes(word[0]).decode("utf-8"))
169
+ #
170
+ # # Example Usage
171
+ # words = trie.bfs(predicate=lambda x: len(x) > 0 and x[0] == 65 or len(x)==0)
172
+ # for word in words:
173
+ # print(bytes(word[0]).decode("utf-8"))
174
+ #
175
+ # token_acceptance = trie.get_token_acceptance(accept=lambda x: len(x) > 0 and x[0] == 65 or len(x)==0)
176
+ # print(sum(token_acceptance))
177
+ # assert sum(token_acceptance) == len(words)
178
+
179
+ ########################
180
+ # UTF-8
181
+ ########################
182
+
183
+ # from transformers import AutoTokenizer
184
+ #
185
+ # japanese = "こんにちは世界"
186
+ # with open("examples/grammars/japanese.ebnf", "r") as file:
187
+ # input_text = file.read()
188
+ # parsed_grammar = parse_ebnf(input_text)
189
+ #
190
+ # start_rule_id = parsed_grammar.symbol_table["root"]
191
+ #
192
+ # recognizer = GrammarRecognizer(parsed_grammar.grammar_encoding, start_rule_id)
193
+ # accept_state = recognizer.init_accept_state()
194
+ # token_acc = trie.get_token_acceptance(accept=lambda x: recognizer._probe_bytes_partial_match(x, accept_state=accept_state))
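For illustration, a minimal usage sketch of the ByteTrie added above. It is not part of the commit: the import path transformers_gad.trie is assumed from the file layout, and the prefix predicate is a hypothetical stand-in for a grammar-driven accept function.

from transformers import AutoTokenizer
from transformers_gad.trie import ByteTrie  # assumed import path for this commit

tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
trie = ByteTrie.from_tokenizer(tokenizer, unicode=True)

# Hypothetical predicate: keep byte sequences that are consistent with the prefix b"ab".
prefix = list(b"ab")

def accept(seq):
    return seq == prefix[: len(seq)] or prefix == seq[: len(prefix)]

mask = trie.get_token_acceptance(accept=accept, accept_eos=False,
                                 eos_token_id=tokenizer.eos_token_id)
print(sum(mask), "of", len(mask), "tokens remain decodable under this prefix")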
transformers_gad/utf8_utils.py ADDED
@@ -0,0 +1,170 @@
1
+ from dataclasses import dataclass
2
+ from typing import Tuple
3
+
4
+ from dataclasses import dataclass
5
+
6
+
7
+ @dataclass
8
+ class PartialUTF8:
9
+ """
10
+ A data class representing the state of a partially decoded UTF-8 sequence.
11
+
12
+ Attributes:
13
+ - value (int): The current accumulated value of the partially decoded Unicode code point.
14
+ This attribute stores the bits that have been decoded so far. For a fully decoded
15
+ character or before any partial decoding has started, this would typically be `0`.
16
+
17
+ - n_remain (int): The number of bytes remaining to complete the current UTF-8 encoded character.
18
+ A value of `-1` indicates that there is no ongoing partial decoding, i.e.,
19
+ either decoding has not started, or the last character was fully decoded.
20
+
21
+ This class is used to handle situations where UTF-8 encoded data may end in the middle of a character
22
+ sequence, allowing for the decoding process to be resumed when more data becomes available.
23
+ """
24
+
25
+ value: int = 0 # Default to 0, indicating no partial value accumulated
26
+ n_remain: int = (
27
+ -1
28
+ ) # Default to -1, indicating no bytes are currently expected to complete the character
29
+
30
+ def __hash__(self):
31
+ return hash((self.value, self.n_remain))
32
+
33
+ def __eq__(self, other):
34
+ if not isinstance(other, PartialUTF8):
35
+ return NotImplemented
36
+ return self.value == other.value and self.n_remain == other.n_remain
37
+
38
+
39
+ from typing import List, Tuple
40
+ from functools import lru_cache
41
+
42
+
43
+ @lru_cache(maxsize=3000000)
44
+ def decode_utf8(
45
+ src: bytes, partial_start: PartialUTF8
46
+ ) -> Tuple[List[int], PartialUTF8]:
47
+ # Lookup table for determining the total bytes based on the first byte's high 4 bits
48
+ lookup = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4]
49
+ pos = 0 # Position in the src bytes to start decoding from
50
+ code_points = [] # List to store the decoded Unicode code points
51
+ value = partial_start.value # Start with any previously partial decoded value
52
+ n_remain = partial_start.n_remain # Number of bytes remaining from a partial decode
53
+
54
+ # If there's a partial sequence left from last decode, try to continue decoding it
55
+ while pos < len(src) and n_remain > 0:
56
+ next_byte = src[pos] # Get the next byte to process
57
+ # Check if the continuation byte format is correct (`10xxxxxx`)
58
+ if (next_byte >> 6) != 2:
59
+ # If not, it's an invalid sequence. Abort and return a special error state.
60
+ code_points = [0]
61
+ return code_points, PartialUTF8(0, -1)
62
+
63
+ # Accumulate the value by shifting left and adding the relevant 6 bits
64
+ value = (value << 6) + (next_byte & 0x3F)
65
+ pos += 1 # Move to the next byte
66
+ n_remain -= 1 # Decrement the number of remaining bytes
67
+
68
+ # If we've completed a partial sequence, add its value to the code points
69
+ if partial_start.n_remain > 0 and n_remain == 0:
70
+ code_points.append(value)
71
+
72
+ # Process the rest of src as complete or new UTF-8 sequences
73
+ while pos < len(src):
74
+ first_byte = src[pos] # Get the first byte of the next sequence
75
+ highbits = first_byte >> 4 # Extract the high 4 bits for the lookup table
76
+ n_remain = lookup[highbits] - 1 # Determine remaining bytes in this sequence
77
+
78
+ # If lookup returns an invalid number, it's an invalid sequence. Abort.
79
+ if n_remain < 0:
80
+ # raise ValueError("Invalid UTF-8 sequence")
81
+ code_points = [0]
82
+ return code_points, PartialUTF8(0, -1)
83
+
84
+ # Calculate the mask to isolate significant bits from the first byte
85
+ mask = (1 << (7 - n_remain)) - 1
86
+ value = first_byte & mask # Apply the mask to get the initial value
87
+ pos += 1 # Move to the next byte
88
+
89
+ # Process the continuation bytes
90
+ while pos < len(src) and n_remain > 0:
91
+ next_byte = src[pos]
92
+ # Shift the accumulated value and add the next 6 significant bits
93
+ value = (value << 6) + (next_byte & 0x3F)
94
+ pos += 1 # Move to the next byte
95
+ n_remain -= 1 # Decrement the number of remaining bytes
96
+
97
+ # If the sequence is complete, add its decoded value to the code points
98
+ if n_remain == 0:
99
+ code_points.append(value)
100
+
101
+ # # Append a terminating value to indicate the end (following llama-cpp implementation)
102
+ # code_points.append(0)
103
+ # reset to the initial state once decoding completes; this is crucial for the LRU cache to hit on subsequent calls
104
+ if n_remain == 0:
105
+ n_remain = -1
106
+ value = 0
107
+
108
+ # Return the decoded code points and the state of any partial decoding
109
+ return code_points, PartialUTF8(value, n_remain)
110
+
111
+
112
+ def decode_utf8_leading_char(src: bytes) -> tuple:
113
+ first_byte = src[0]
114
+ highbits = first_byte >> 4
115
+ lookup = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4]
116
+ char_len = lookup[highbits]
117
+
118
+ # Extract the relevant bytes for the UTF-8 character
119
+ utf8_char_bytes = src[:char_len]
120
+
121
+ # Decode the character
122
+ char = utf8_char_bytes.decode("utf-8")
123
+
124
+ # Use ord() to convert the single character to its Unicode code point
125
+ code_point = ord(char)
126
+
127
+ # Remaining bytes
128
+ remaining_bytes = src[char_len:]
129
+
130
+ return code_point, remaining_bytes
131
+
132
+
133
+ def decode_utf8_string(utf8_bytes: bytes) -> list:
134
+ code_points = []
135
+ while utf8_bytes:
136
+ code_point, utf8_bytes = decode_utf8_leading_char(utf8_bytes)
137
+ code_points.append(code_point)
138
+ return code_points
139
+
140
+ if __name__ == "__main__":
141
+ # Given string
142
+ my_string = "€Hello" # The Euro symbol followed by "Hello"
143
+
144
+ # Get UTF-8 encoded bytes
145
+ utf8_bytes = my_string.encode("utf-8")
146
+
147
+ assert utf8_bytes == b"\xe2\x82\xacHello"
148
+
149
+ # Example usage with the Euro symbol followed by more characters
150
+ code_point, remaining_bytes = decode_utf8_leading_char(utf8_bytes)
151
+
152
+ print(f"Code Point: {code_point}") # Expected Output: 8364 (Euro symbol)
153
+ print(f"Remaining Bytes: {remaining_bytes}") # Expected Output: b'Hello'
154
+
155
+ # Example usage with the entire string
156
+ code_points = decode_utf8_string(utf8_bytes)
157
+
158
+ print(
159
+ f"Code Points: {code_points}"
160
+ ) # Expected Output: [8364, 72, 101, 108, 108, 111]
161
+
162
+ print("-" * 50)
163
+
164
+ # Example usage:
165
+ utf8_bytes = b"\xe2\x82\xacHello" # UTF-8 encoded string (Euro symbol + "Hello")
166
+ partial_start = PartialUTF8() # Assuming start with no partial sequence
167
+ code_points, partial_utf8 = decode_utf8(utf8_bytes, partial_start)
168
+
169
+ print("Code Points:", code_points)
170
+ print("Remaining UTF-8 State:", partial_utf8.value, partial_utf8.n_remain)
transformers_gad/utils.py ADDED
@@ -0,0 +1,98 @@
1
+ import json
2
+ import warnings
3
+ from typing import List
4
+
5
+ from termcolor import colored
6
+
7
+
8
+ def ints2bytes(sequence: List[int]) -> bytes:
9
+ # check in the range of 0-255
10
+ for item in sequence:
11
+ if not 0 <= item <= 255:
12
+ raise ValueError(f"item: {item} is not in the range [0, 255]")
13
+ return bytes(sequence)
14
+
15
+
16
+ def bytes2ints(byte_sequence: bytes) -> List[int]:
17
+ return list(byte_sequence)
18
+
19
+
20
+ def intervals_intersect(low1, high1, low2, high2):
21
+ """
22
+ Check if two intervals [low1, high1] and [low2, high2] intersect.
23
+
24
+ :param high1: High bound of the first interval.
25
+ :param low1: Low bound of the first interval.
26
+ :param high2: High bound of the second interval.
27
+ :param low2: Low bound of the second interval.
28
+ :return: True if the intervals intersect, False otherwise.
29
+ """
30
+ # Check if one interval is completely to the right of the other
31
+ if low1 > high2 or low2 > high1:
32
+ return False
33
+
34
+ # If the above condition is not met, the intervals intersect
35
+ return True
36
+
37
+
38
+ def pprint_token_ids(tokenizer, token_ids=None, text=None):
39
+ if token_ids is None and text is None:
40
+ raise ValueError("Either token_ids or text should be provided")
41
+ if token_ids is None:
42
+ token_ids = tokenizer.encode(text, add_special_tokens=False)
43
+ special_token_ids = tokenizer.all_special_ids
44
+ special_tokens = tokenizer.all_special_tokens
45
+ special_id2token = {
46
+ id: token for id, token in zip(special_token_ids, special_tokens)
47
+ }
48
+ # loop over token_ids and color the special tokens
49
+ colored_token_ids = []
50
+
51
+ for token_id in token_ids:
52
+ if token_id in special_id2token:
53
+ colored_token_ids.append(colored(token_id, "red", attrs=["bold"]))
54
+ else:
55
+ colored_token_ids.append(str(token_id))
56
+ colored_token_ids_str = [str(item) for item in colored_token_ids]
57
+ print("[" + ", ".join(colored_token_ids_str) + "]")
58
+
59
+
60
+ def get_tokenizer_model_type(model: str = "gpt2"):
61
+ """
62
+ reference https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_fast.py#L729
63
+ :param model:
64
+ :return: BPE, Unigram, WordPiece, WordLevel
65
+ SentencePiece is used in conjunction with Unigram
66
+ """
67
+ from transformers import AutoTokenizer
68
+
69
+ # if the tokenizer is not in the repo, it will raise OSError
70
+ # OSError: Can't load tokenizer for 'xxx'
71
+ # This happens when the model reuses the tokenizer of another model
72
+ if type(model) == str:
73
+ try:
74
+ tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
75
+ # check if the tokenizer is fast
76
+ except OSError:
77
+ return None
78
+ else:
79
+ tokenizer = model
80
+
81
+ if not tokenizer.is_fast:
82
+ raise ValueError(f"The tokenizer {model} is not fast tokenizer")
83
+ tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
84
+ model_type = tokenizer_json["model"]["type"]
85
+ if (
86
+ model_type == "BPE"
87
+ and tokenizer_json["pre_tokenizer"] is not None
88
+ and (
89
+ tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
90
+ or (
91
+ "pretokenizers" in tokenizer_json["pre_tokenizer"]
92
+ and tokenizer_json["pre_tokenizer"]["pretokenizers"][1]["type"]
93
+ == "ByteLevel"
94
+ )
95
+ )
96
+ ):
97
+ model_type = "ByteLevelBPE"
98
+ return model_type
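A small usage sketch for the helpers in utils.py, not part of the commit itself; the import path is assumed from this commit's layout, and the printed tokenizer type is the expected value for gpt2 rather than a guaranteed one.

from transformers_gad.utils import (
    ints2bytes, bytes2ints, intervals_intersect, get_tokenizer_model_type,
)  # assumed import path

# Round-trip between byte values and Python bytes.
seq = bytes2ints("héllo".encode("utf-8"))           # [104, 195, 169, 108, 108, 111]
assert ints2bytes(seq).decode("utf-8") == "héllo"

# Character-range checks of the kind a grammar recognizer performs.
assert intervals_intersect(ord("a"), ord("z"), ord("m"), ord("p"))      # overlapping
assert not intervals_intersect(ord("a"), ord("f"), ord("x"), ord("z"))  # disjoint

print(get_tokenizer_model_type("gpt2"))  # expected: ByteLevelBPE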
transformers_gad/vocab_struct.py ADDED
@@ -0,0 +1,83 @@
1
+ #################
2
+ # DATA STRUCTURES
3
+ #################
4
+
5
+ import logging
6
+ import re
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ LEAF = -1
11
+
12
+ # TokenTrie stores each token's byte representation and indexes the vocabulary in a byte-level trie (byte sequence -> token ID)
13
+
14
+ class TokenTrie:
15
+ def __init__(self, tokenizer):
16
+ self.eos_token_id = tokenizer.eos_token_id
17
+ self.tokens = []
18
+ self.trie = {}
19
+ self.load_tokens(tokenizer)
20
+
21
+ def id2str(self, token_id):
22
+ return self.tokens[token_id]
23
+
24
+ def __len__(self):
25
+ return len(self.tokens)
26
+
27
+ def load_tokens(self, tokenizer):
28
+ def replace_hex(match):
29
+ hex_value = match.group(1)
30
+ return chr(int(hex_value, 16))
31
+
32
+ if "gpt2" in tokenizer.__class__.__name__.lower():
33
+ special = tokenizer.additional_special_tokens_ids
34
+
35
+ # Here, the decoder does a string replace on a bunch of sequences
36
+ # like ' .' for '.'. This interferes with our assumptions, where a
37
+ # token should always have exactly one representation.
38
+ # Fortunately(?) text-generation-inference doesn't seem to run this
39
+ # cleanup, so we get extraneous spaces. So, in order to generate
40
+ # the right token set for TGI, we have to skip the space trimming.
41
+ # See:
42
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3588-L3600
43
+ def fmt_token(id):
44
+ if id in special:
45
+ return None
46
+ return bytes(
47
+ tokenizer.decode([id], clean_up_tokenization_spaces=False), "utf-8"
48
+ )
49
+
50
+ elif (
51
+ "llama" in tokenizer.__class__.__name__.lower()
52
+ or "t5" in tokenizer.__class__.__name__.lower()
53
+ ):
54
+
55
+ def fmt_token(id):
56
+ token = tokenizer.convert_ids_to_tokens(id)
57
+ token = re.sub(r"<0x([0-9a-fA-F]{2})>", replace_hex, token)
58
+ token = token.replace("▁", " ")
59
+ return bytes(token, "utf-8")  # return the byte representation of the token
60
+
61
+ else:
62
+ logger.warning(
63
+ "Warning: unrecognized tokenizer: using default token formatting"
64
+ )
65
+
66
+ def fmt_token(id):
67
+ token = tokenizer.convert_ids_to_tokens(id)
68
+ return bytes(token, "utf-8")
69
+
70
+ # note: vocab_size doesn't work here because there are also
71
+ # get_added_vocab() tokens
72
+ self.tokens = [fmt_token(i) for i in range(len(tokenizer.get_vocab()))]
73
+ for token_id, token_bytes in enumerate(self.tokens):
74
+ if token_bytes is not None:
75
+ self.insert_into_trie(self.trie, token_bytes, token_id)
76
+
77
+ def insert_into_trie(self, trie, token_bytes, token_id):
78
+ current = trie
79
+ for byte in token_bytes:
80
+ if byte not in current:
81
+ current[byte] = {}
82
+ current = current[byte]
83
+ current[LEAF] = token_id
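To close, a sketch of walking the nested-dict trie built by TokenTrie. The import path and the LEAF constant are taken from the file above, while the b"th" prefix and the collect helper are illustrative choices, not part of the commit.

from transformers import AutoTokenizer
from transformers_gad.vocab_struct import TokenTrie, LEAF  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
token_trie = TokenTrie(tokenizer)

# Descend one nested dict per byte of the (hypothetical) prefix b"th".
node = token_trie.trie
for byte in b"th":
    node = node.get(byte, {})

def collect(node, out):
    # Gather the token id stored under LEAF at every node of this subtree.
    for key, child in node.items():
        if key == LEAF:
            out.append(child)
        else:
            collect(child, out)
    return out

ids = collect(node, [])
print(len(ids), [token_trie.id2str(i) for i in ids[:5]])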