Module: Tokenizers::FromPretrained
- Included in:
- Tokenizer
- Defined in:
- lib/tokenizers/from_pretrained.rb
Constant Summary collapse
- TOKENIZERS_VERSION =
for user agent
"0.21.0"
Instance Method Summary collapse
-
#from_pretrained(identifier, revision: "main", auth_token: nil) ⇒ Object
Uses Ruby for downloads. This avoids the need to vendor OpenSSL on Linux and reduces the extension size by about half.
Instance Method Details
#from_pretrained(identifier, revision: "main", auth_token: nil) ⇒ Object
Uses Ruby for downloads. This avoids the need to vendor OpenSSL on Linux and reduces the extension size by about half.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/tokenizers/from_pretrained.rb', line 9

# Builds the Hub URL for a model's tokenizer.json, obtains a local path for it
# through cached_path, and loads that path with from_file.
#
# The download is done in Ruby rather than in the native extension: this avoids
# the need to vendor OpenSSL on Linux and reduces the extension size by about
# half.
#
# @param identifier [String] model identifier on the Hub
# @param revision [String] revision to resolve (defaults to "main")
# @param auth_token [String, nil] optional token; when given it is sent as a
#   "Bearer" Authorization header
# @return [Object] whatever from_file returns for the resolved path
# @raise [Error] when cached_path raises OpenURI::HTTPError
def from_pretrained(identifier, revision: "main", auth_token: nil)
  # Deliberately lazy-loaded so these libraries are only pulled in when a
  # download is actually requested.
  # NOTE(review): digest/fileutils/json are not used in this method itself;
  # presumably the cache helpers rely on them - confirm before removing.
  require "cgi"
  require "digest"
  require "fileutils"
  require "json"
  require "net/http"
  require "open-uri"

  cache_dir = ensure_cache_dir

  # Timeouts forwarded to cached_path (presumably seconds, as in open-uri).
  options = { open_timeout: 3, read_timeout: 30 }

  headers = {
    "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
  }
  headers["Authorization"] = "Bearer #{auth_token}" if auth_token

  # Both path components are escaped before being placed in the URL.
  url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }

  path =
    begin
      cached_path(cache_dir, url, headers, options)
    rescue OpenURI::HTTPError
      raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
    end

  from_file(path)
end