Module: Tokenizers::FromPretrained

Included in:
Tokenizer
Defined in:
lib/tokenizers/from_pretrained.rb

Constant Summary collapse

TOKENIZERS_VERSION =

Version of the underlying tokenizers library, reported in the User-Agent header.

"0.21.0"

Instance Method Summary collapse

Instance Method Details

#from_pretrained(identifier, revision: "main", auth_token: nil) ⇒ Object

Uses Ruby for downloads. This avoids the need to vendor OpenSSL on Linux and reduces the extension size by about half.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/tokenizers/from_pretrained.rb', line 9

# Builds a tokenizer from the +tokenizer.json+ file of a Hugging Face Hub
# repository.
#
# The file is fetched from Ruby rather than from the native extension, which
# avoids the need to vendor OpenSSL on Linux and reduces the extension size
# by about half.
#
# @param identifier [String] repository identifier; URL-escaped and placed in
#   the request path
# @param revision [String] revision to resolve; URL-escaped and placed in the
#   request path
# @param auth_token [String, nil] when given, sent as a Bearer token in the
#   Authorization header
# @return [Object] whatever +from_file+ returns for the resolved local path
# @raise [Error] when +cached_path+ raises OpenURI::HTTPError
def from_pretrained(identifier, revision: "main", auth_token: nil)
  # Loaded lazily so these libraries are only pulled in when this method is used.
  require "cgi"
  require "digest"
  require "fileutils"
  require "json"
  require "net/http"
  require "open-uri"

  cache_dir = ensure_cache_dir

  timeouts = {open_timeout: 3, read_timeout: 30}

  request_headers = {
    "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
  }
  request_headers["Authorization"] = "Bearer #{auth_token}" if auth_token

  escaped_identifier = CGI.escape(identifier)
  escaped_revision = CGI.escape(revision)
  url = "https://huggingface.co/#{escaped_identifier}/resolve/#{escaped_revision}/tokenizer.json"

  # NOTE(review): cached_path presumably downloads the file or reuses a cached
  # copy and returns its local path; confirm against its definition.
  begin
    tokenizer_path = cached_path(cache_dir, url, request_headers, timeouts)
  rescue OpenURI::HTTPError
    # Every HTTP failure is reported as a missing tokenizer.
    raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
  end

  from_file(tokenizer_path)
end