mirror of
https://github.com/finegrain-ai/refiners.git
synced 2024-11-15 01:28:14 +00:00
fix CLIPTokenizer skipping underscores
This commit is contained in:
parent
f89c4f720d
commit
590648ebca
|
@@ -44,7 +44,7 @@ class CLIPTokenizer(fl.Module):
|
||||||
# to get rid of the dependence on the `regex` module. Unicode support could
|
# to get rid of the dependence on the `regex` module. Unicode support could
|
||||||
# potentially be added back by leveraging the `\w` character class.
|
# potentially be added back by leveraging the `\w` character class.
|
||||||
self.token_pattern = re.compile(
|
self.token_pattern = re.compile(
|
||||||
pattern=r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[a-zA-Z]+|[0-9]|[^\s\w]+""",
|
pattern=r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[a-zA-Z]+|[0-9]|(?:[^\s\w]|_)+""",
|
||||||
flags=re.IGNORECASE,
|
flags=re.IGNORECASE,
|
||||||
)
|
)
|
||||||
self.start_of_text_token_id: int = start_of_text_token_id
|
self.start_of_text_token_id: int = start_of_text_token_id
|
||||||
|
|
Loading…
Reference in a new issue