From 4187419b4c4d45941ab13f7fbe48f9235fdfa61a Mon Sep 17 00:00:00 2001
From: Scott Lundberg
Date: Wed, 31 Jan 2024 22:51:10 +0000
Subject: [PATCH] Fix #609 to ensure token consistency

---
 guidance/_grammar.py                          | 22 ++++++++++++++-----
 guidance/models/transformers/_transformers.py |  3 ++-
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/guidance/_grammar.py b/guidance/_grammar.py
index 19a7e0412..dedd3eb26 100644
--- a/guidance/_grammar.py
+++ b/guidance/_grammar.py
@@ -7,19 +7,29 @@
 from . import _serialization_pb2
 from . import _parser
 
-tag_start = "{{G|"
-tag_end = "|G}}"
-_call_pool = {}
-_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))
+
+# to support the embedding of guidance functions inside Python f-strings we use tags with these delimiters
+tag_start = "{{G|" # start of a call tag
+tag_end = "|G}}" # end of a call tag
+_call_pool = {} # the functions associated with the call tags
+_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end)) # the pattern for matching call tags
 
 class StatefulException(Exception):
     '''This is raised when we try and use the state of a grammar object like it was a live model.
 
-    Note that eventually we do want to support stateful parser/grammar constructs directly, but
-    for now we use a traditional parser and grammar separation (hence the need for this exception).'''
+    Note that eventually it would be nice to support stateful parser/grammar constructs directly, but
+    such "parser combinators" cannot be run efficiently in Python. So we use a traditional parser and
+    grammar separation (hence the need for this exception).'''
     pass
 
 class Function():
+    '''This is the abstract class representing all guidance functions.
+
+    There are two main subclasses: GrammarFunction and RawFunction. GrammarFunctions
+    represent guidance grammars that can be serialized and sent across the wire, while
+    RawFunctions represent unconstrained native Python functions.
+    '''
+
     def __init__(self, name, value=None) -> None:
         self.name = name
         self.value = value
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 5d1471df4..0464ed6c2 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -93,7 +93,8 @@ def _model_and_tokenizer(self, model, tokenizer, **kwargs):
         return model, tokenizer
 
     def _joint_tokenize(self, token_ids):
-        first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        # first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        first_decode = b''.join([self.tokenizer.tokens[id] for id in token_ids]).decode("utf8")
         new_ids = self.tokenizer._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
 
         # HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
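
Note (reviewer sketch, not part of the patch): the change in _joint_tokenize replaces the tokenizer's decode() with a byte-join over the raw token strings because, as the HACK comment in the patched file warns, some HuggingFace tokenizers add extra spaces during an encode-decode cycle, so the re-encoded ids can drift from the originals. The minimal, self-contained sketch below illustrates that failure mode and why byte-joining avoids it; the toy `tokens` table and `drifting_decode` are invented stand-ins, not the real guidance or HuggingFace APIs:

    # Toy vocabulary: raw UTF-8 bytes for token ids 0 and 1, mirroring the
    # role of the `self.tokenizer.tokens` table the patched code indexes into.
    tokens = [b"hello", b" world"]
    token_ids = [0, 1]

    def drifting_decode(ids):
        # Stand-in for a decode() that injects an extra leading space, the
        # kind of whitespace drift the HACK comment describes.
        return " " + b"".join(tokens[i] for i in ids).decode("utf8")

    # Old path: decode() does not round-trip the original text.
    old_first_decode = drifting_decode(token_ids)   # ' hello world'

    # New path (what the patch does): concatenating the raw token bytes
    # reproduces the text exactly, so re-encoding it yields the same ids.
    new_first_decode = b"".join(tokens[i] for i in token_ids).decode("utf8")

    assert new_first_decode == "hello world"
    assert old_first_decode != new_first_decode  # the inconsistency behind #609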