fix CLIPTokenizer skipping underscores

2024-11-21 05:38:46 +00:00 · 2024-10-01 14:02:03 -04:00 · 2024-10-01 14:02:03 -04:00 · 590648ebca
parent f89c4f720d
commit 590648ebca
1 changed file with 1 addition and 1 deletion
--- a/src/refiners/foundationals/clip/tokenizer.py
+++ b/src/refiners/foundationals/clip/tokenizer.py
@@ -44,7 +44,7 @@ class CLIPTokenizer(fl.Module):
        # to get rid of the dependence on the `regex` module. Unicode support could
        # potentially be added back by leveraging the `\w` character class.
        self.token_pattern = re.compile(
-            pattern=r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[a-zA-Z]+|[0-9]|[^\s\w]+""",
+            pattern=r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[a-zA-Z]+|[0-9]|(?:[^\s\w]|_)+""",
            flags=re.IGNORECASE,
        )
        self.start_of_text_token_id: int = start_of_text_token_id