Module: Transformers::HfHub
- Defined in:
- lib/transformers/hf_hub/errors.rb,
lib/transformers/hf_hub/constants.rb,
lib/transformers/hf_hub/file_download.rb,
lib/transformers/hf_hub/utils/_errors.rb,
lib/transformers/hf_hub/utils/_headers.rb
Defined Under Namespace
Classes: BadRequestError, DisabledRepoError, EntryNotFoundError, Error, GatedRepoError, HfFileMetadata, HfHubHTTPError, LocalEntryNotFoundError, LocalTokenNotFoundError, OfflineModeIsEnabled, RepositoryNotFoundError, RevisionNotFoundError
Constant Summary collapse
- ENV_VARS_TRUE_VALUES =
Possible values for env variables
["1", "ON", "YES", "TRUE"]
- ENV_VARS_TRUE_AND_AUTO_VALUES =
ENV_VARS_TRUE_VALUES + ["AUTO"]
- DEFAULT_ETAG_TIMEOUT =
Constants for file downloads
10
- DEFAULT_REVISION =
Git-related constants
"main"
- ENDPOINT =
ENV["HF_ENDPOINT"] || "https://huggingface.co"
- HUGGINGFACE_CO_URL_TEMPLATE =
ENDPOINT + "/%{repo_id}/resolve/%{revision}/%{filename}"
- HUGGINGFACE_HEADER_X_REPO_COMMIT =
"x-repo-commit"
- HUGGINGFACE_HEADER_X_LINKED_ETAG =
"x-linked-etag"
- HUGGINGFACE_HEADER_X_LINKED_SIZE =
"x-linked-size"
- REPO_ID_SEPARATOR =
"--"
- REPO_TYPE_DATASET =
^ this substring is not allowed in repo_ids on hf.co and is the canonical one we use for serialization of repo ids elsewhere.
"dataset"
- REPO_TYPE_SPACE =
"space"
- REPO_TYPE_MODEL =
"model"
- REPO_TYPES =
[nil, REPO_TYPE_MODEL, REPO_TYPE_DATASET, REPO_TYPE_SPACE]
- REPO_TYPES_URL_PREFIXES =
{ REPO_TYPE_DATASET => "datasets/", REPO_TYPE_SPACE => "spaces/" }
- DEFAULT_HOME =
default cache
File.join(ENV.fetch("HOME"), ".cache")
- HF_HOME =
File.expand_path( ENV.fetch( "HF_HOME", File.join(ENV.fetch("XDG_CACHE_HOME", DEFAULT_HOME), "huggingface") ) )
- HF_HUB_CACHE =
New env variables
ENV["HF_HUB_CACHE"] || File.join(HF_HOME, "hub")
- HF_HUB_OFFLINE =
_is_true(ENV["HF_HUB_OFFLINE"] || ENV["TRANSFORMERS_OFFLINE"])
- HF_HUB_DISABLE_IMPLICIT_TOKEN =
Disable sending the cached token by default in all HTTP requests to the Hub
_is_true(ENV["HF_HUB_DISABLE_IMPLICIT_TOKEN"])
- HF_HUB_ENABLE_HF_TRANSFER =
_is_true(ENV["HF_HUB_ENABLE_HF_TRANSFER"])
- CACHED_NO_EXIST =
Return value when trying to load a file from cache but the file does not exist in the distant repo.
Object.new
- HEADER_FILENAME_PATTERN =
Regex to get filename from a “Content-Disposition” header for CDN-served files
/filename="(?<filename>.*?)";/
- REGEX_COMMIT_HASH =
Regex to check if the revision IS directly a commit_hash
/^[0-9a-f]{40}$/
Class Method Summary collapse
- ._as_int(value) ⇒ Object
- ._cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) ⇒ Object
- ._check_disk_space(expected_size, target_dir) ⇒ Object
- ._chmod_and_move(src, dst) ⇒ Object
- ._create_symlink(src, dst, new_blob: false) ⇒ Object
- ._download_to_tmp_and_move(incomplete_path:, destination_path:, url_to_download:, proxies:, headers:, expected_size:, filename:, force_download:) ⇒ Object
- ._get_metadata_or_catch_error(repo_id:, filename:, repo_type:, revision:, endpoint:, proxies:, etag_timeout:, headers:, token:, local_files_only:, relative_filename: nil, storage_folder: nil) ⇒ Object
- ._get_pointer_path(storage_folder, revision, relative_filename) ⇒ Object
- ._hf_hub_download_to_cache_dir(cache_dir:, repo_id:, filename:, repo_type:, revision:, endpoint:, etag_timeout:, headers:, proxies:, token:, local_files_only:, force_download:) ⇒ Object
- ._http_user_agent(library_name: nil, library_version: nil, user_agent: nil) ⇒ Object
- ._int_or_none(value) ⇒ Object
- ._is_true(value) ⇒ Object
- ._normalize_etag(etag) ⇒ Object
- ._raise_on_head_call_error(head_call_error, force_download, local_files_only) ⇒ Object
- ._request_wrapper(method, url, follow_relative_redirects: false, redirects: 0, **params) ⇒ Object
- ._validate_token_to_send(token, is_write_action) ⇒ Object
- .build_hf_headers(token: nil, is_write_action: false, library_name: nil, library_version: nil, user_agent: nil, headers: nil) ⇒ Object
- .display_progress(filename, width, size, expected_size) ⇒ Object
- .get_hf_file_metadata(url, token: nil, proxies: nil, timeout: DEFAULT_REQUEST_TIMEOUT, library_name: nil, library_version: nil, user_agent: nil, headers: nil) ⇒ Object
- .get_token_to_send(token) ⇒ Object
- .hf_hub_download(repo_id, filename, subfolder: nil, repo_type: nil, revision: nil, library_name: nil, library_version: nil, cache_dir: nil, local_dir: nil, local_dir_use_symlinks: "auto", user_agent: nil, force_download: false, force_filename: nil, proxies: nil, etag_timeout: DEFAULT_ETAG_TIMEOUT, resume_download: false, token: nil, local_files_only: false, legacy_cache_layout: false, endpoint: nil) ⇒ Object
- .hf_hub_url(repo_id, filename, subfolder: nil, repo_type: nil, revision: nil, endpoint: nil) ⇒ Object
- .hf_raise_for_status(response, endpoint_name: nil) ⇒ Object
- .http_get(url, temp_file, proxies: nil, resume_size: 0, headers: nil, expected_size: nil, displayed_filename: nil, _nb_retries: 5) ⇒ Object
-
.netloc(uri) ⇒ Object
additional methods.
- .parents(path) ⇒ Object
- .repo_folder_name(repo_id:, repo_type:) ⇒ Object
- .try_to_load_from_cache(repo_id, filename, cache_dir: nil, revision: nil, repo_type: nil) ⇒ Object
Class Method Details
._as_int(value) ⇒ Object
15 16 17 18 19 20 |
# File 'lib/transformers/hf_hub/constants.rb', line 15
# Coerce an env-var string to an Integer; nil passes through unchanged.
def self._as_int(value)
  return nil if value.nil?
  value.to_i
end
._cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) ⇒ Object
194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/transformers/hf_hub/file_download.rb', line 194 def _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) if revision != commit_hash ref_path = Pathname.new(storage_folder) / "refs" / revision ref_path.parent.mkpath if !ref_path.exist? || commit_hash != ref_path.read # Update ref only if has been updated. Could cause useless error in case # repo is already cached and user doesn't have write access to cache folder. # See https://github.com/huggingface/huggingface_hub/issues/1216. ref_path.write(commit_hash) end end end |
._check_disk_space(expected_size, target_dir) ⇒ Object
213 214 215 |
# File 'lib/transformers/hf_hub/file_download.rb', line 213 def _check_disk_space(expected_size, target_dir) # TODO end |
._chmod_and_move(src, dst) ⇒ Object
707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 |
# File 'lib/transformers/hf_hub/file_download.rb', line 707 def _chmod_and_move(src, dst) tmp_file = dst.parent.parent / "tmp_#{SecureRandom.uuid}" begin FileUtils.touch(tmp_file) cache_dir_mode = Pathname.new(tmp_file).stat.mode src.chmod(cache_dir_mode) ensure begin tmp_file.unlink rescue Errno::ENOENT # fails if `tmp_file.touch()` failed => do nothing # See https://github.com/huggingface/huggingface_hub/issues/2359 end end FileUtils.move(src.to_s, dst.to_s) end |
._create_symlink(src, dst, new_blob: false) ⇒ Object
180 181 182 183 184 185 186 187 188 189 190 191 192 |
# File 'lib/transformers/hf_hub/file_download.rb', line 180 def _create_symlink(src, dst, new_blob: false) begin FileUtils.rm(dst) rescue Errno::ENOENT # do nothing end # abs_src = File.absolute_path(File.expand_path(src)) # abs_dst = File.absolute_path(File.expand_path(dst)) # abs_dst_folder = File.dirname(abs_dst) FileUtils.symlink(src, dst) end |
._download_to_tmp_and_move(incomplete_path:, destination_path:, url_to_download:, proxies:, headers:, expected_size:, filename:, force_download:) ⇒ Object
645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 |
# File 'lib/transformers/hf_hub/file_download.rb', line 645
# Download `url_to_download` into `incomplete_path` (resuming if partial data
# exists), then atomically promote it to `destination_path`.
# Reconstructed: the stripped local variable holding log text is restored as
# `message`.
def _download_to_tmp_and_move(
  incomplete_path:,
  destination_path:,
  url_to_download:,
  proxies:,
  headers:,
  expected_size:,
  filename:,
  force_download:
)
  if destination_path.exist? && !force_download
    # Do nothing if already exists (except if force_download=True)
    return
  end

  if incomplete_path.exist? && (force_download || (HF_HUB_ENABLE_HF_TRANSFER && !proxies))
    # By default, we will try to resume the download if possible.
    # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
    # not resume the download => delete the incomplete file.
    message = "Removing incomplete file '#{incomplete_path}'"
    if force_download
      message += " (force_download: true)"
    elsif HF_HUB_ENABLE_HF_TRANSFER && !proxies
      message += " (hf_transfer: true)"
    end
    Transformers.logger.info(message)
    incomplete_path.unlink #(missing_ok=True)
  end

  incomplete_path.open("ab") do |f|
    f.seek(0, IO::SEEK_END)
    resume_size = f.tell
    message = "Downloading '#{filename}' to '#{incomplete_path}'"
    if resume_size > 0 && !expected_size.nil?
      message += " (resume from #{resume_size}/#{expected_size})"
    end
    Transformers.logger.info(message)

    if !expected_size.nil? # might be None if HTTP header not set correctly
      # Check disk space in both tmp and destination path
      _check_disk_space(expected_size, incomplete_path.parent)
      _check_disk_space(expected_size, destination_path.parent)
    end

    http_get(
      url_to_download,
      f,
      proxies: proxies,
      resume_size: resume_size,
      headers: headers,
      expected_size: expected_size,
    )
  end

  Transformers.logger.info("Download complete. Moving file to #{destination_path}")
  _chmod_and_move(incomplete_path, destination_path)
end
._get_metadata_or_catch_error(repo_id:, filename:, repo_type:, revision:, endpoint:, proxies:, etag_timeout:, headers:, token:, local_files_only:, relative_filename: nil, storage_folder: nil) ⇒ Object
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 |
# File 'lib/transformers/hf_hub/file_download.rb', line 530
# HEAD the Hub for file metadata and return
# [url_to_download, etag, commit_hash, expected_size, head_call_error].
# In offline mode, returns an OfflineModeIsEnabled error in the last slot.
# Reconstructed: def name and the `metadata` local were stripped by extraction;
# the `head_error_call`/`head_call_error` naming inconsistency is unified to
# `head_call_error`; unreachable `raise Todo` after `raise e` removed; grammar
# fix in the offline message ("as been set" -> "has been set").
def _get_metadata_or_catch_error(
  repo_id:,
  filename:,
  repo_type:,
  revision:,
  endpoint:,
  proxies:,
  etag_timeout:,
  headers:, # mutated inplace!
  token:,
  local_files_only:,
  relative_filename: nil, # only used to store `.no_exists` in cache
  storage_folder: nil # only used to store `.no_exists` in cache
)
  if local_files_only
    return [
      nil,
      nil,
      nil,
      nil,
      OfflineModeIsEnabled.new(
        "Cannot access file since 'local_files_only: true' has been set. (repo_id: #{repo_id}, repo_type: #{repo_type}, revision: #{revision}, filename: #{filename})"
      )
    ]
  end

  url = hf_hub_url(repo_id, filename, repo_type: repo_type, revision: revision, endpoint: endpoint)
  url_to_download = url
  etag = nil
  commit_hash = nil
  expected_size = nil
  head_call_error = nil

  metadata = nil
  begin
    metadata = get_hf_file_metadata(url, proxies: proxies, timeout: etag_timeout, headers: headers, token: token)
  rescue => e
    # TODO map HTTP errors to head_call_error instead of re-raising
    raise e
  end

  # Commit hash must exist
  commit_hash = metadata.commit_hash
  if commit_hash.nil?
    raise Todo
  end

  # Etag must exist
  etag = metadata.etag
  if etag.nil?
    raise Todo
  end

  # Expected (uncompressed) size
  expected_size = metadata.size
  if expected_size.nil?
    raise Todo
  end

  if metadata.location != url
    url_to_download = metadata.location
    if netloc(URI.parse(url)) != netloc(URI.parse(metadata.location))
      # Remove authorization header when downloading a LFS blob
      headers.delete("authorization")
    end
  end

  if !(local_files_only || !etag.nil? || !head_call_error.nil?)
    raise "etag is empty due to uncovered problems"
  end

  [url_to_download, etag, commit_hash, expected_size, head_call_error]
end
._get_pointer_path(storage_folder, revision, relative_filename) ⇒ Object
725 726 727 728 729 730 731 732 733 734 735 |
# File 'lib/transformers/hf_hub/file_download.rb', line 725 def _get_pointer_path(storage_folder, revision, relative_filename) snapshot_path = File.join(storage_folder, "snapshots") pointer_path = File.join(snapshot_path, revision, relative_filename) if !parents(Pathname.new(File.absolute_path(pointer_path))).include?(Pathname.new(File.absolute_path(snapshot_path))) raise ArgumentError, "Invalid pointer path: cannot create pointer path in snapshot folder if" + " `storage_folder: #{storage_folder.inspect}`, `revision: #{revision.inspect}` and" + " `relative_filename: #{relative_filename.inspect}`." end pointer_path end |
._hf_hub_download_to_cache_dir(cache_dir:, repo_id:, filename:, repo_type:, revision:, endpoint:, etag_timeout:, headers:, proxies:, token:, local_files_only:, force_download:) ⇒ Object
293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 |
# File 'lib/transformers/hf_hub/file_download.rb', line 293
# Core cached-download flow: resolve metadata, reuse local blob/pointer when
# possible, otherwise download the blob and symlink a snapshot pointer to it.
# Returns the pointer path. Reconstructed: the stripped call target for the
# metadata fetch is restored as `_get_metadata_or_catch_error`.
def _hf_hub_download_to_cache_dir(
  # Destination
  cache_dir:,
  # File info
  repo_id:,
  filename:,
  repo_type:,
  revision:,
  # HTTP info
  endpoint:,
  etag_timeout:,
  headers:,
  proxies:,
  token:,
  # Additional options
  local_files_only:,
  force_download:
)
  _locks_dir = File.join(cache_dir, ".locks")
  storage_folder = File.join(cache_dir, repo_folder_name(repo_id: repo_id, repo_type: repo_type))

  # cross platform transcription of filename, to be used as a local file path.
  relative_filename = File.join(*filename.split("/"))

  # if user provides a commit_hash and they already have the file on disk, shortcut everything.
  if REGEX_COMMIT_HASH.match?(revision)
    pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
    if File.exist?(pointer_path) && !force_download
      return pointer_path
    end
  end

  # Try to get metadata (etag, commit_hash, url, size) from the server.
  # If we can't, a HEAD request error is returned.
  url_to_download, etag, commit_hash, expected_size, head_call_error =
    _get_metadata_or_catch_error(
      repo_id: repo_id,
      filename: filename,
      repo_type: repo_type,
      revision: revision,
      endpoint: endpoint,
      proxies: proxies,
      etag_timeout: etag_timeout,
      headers: headers,
      token: token,
      local_files_only: local_files_only,
      storage_folder: storage_folder,
      relative_filename: relative_filename
    )

  # etag can be None for several reasons:
  # 1. we passed local_files_only.
  # 2. we don't have a connection
  # 3. Hub is down (HTTP 500 or 504)
  # 4. repo is not found -for example private or gated- and invalid/missing token sent
  # 5. Hub is blocked by a firewall or proxy is not set correctly.
  # => Try to get the last downloaded one from the specified revision.
  #
  # If the specified revision is a commit hash, look inside "snapshots".
  # If the specified revision is a branch or tag, look inside "refs".
  if !head_call_error.nil?
    # Couldn't make a HEAD call => let's try to find a local file
    if !force_download
      commit_hash = nil
      if REGEX_COMMIT_HASH.match(revision)
        commit_hash = revision
      else
        ref_path = File.join(storage_folder, "refs", revision)
        if File.exist?(ref_path)
          commit_hash = File.read(ref_path)
        end
      end

      # Return pointer file if exists
      if !commit_hash.nil?
        pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
        if File.exist?(pointer_path) && !force_download
          return pointer_path
        end
      end
    end

    # Otherwise, raise appropriate error
    _raise_on_head_call_error(head_call_error, force_download, local_files_only)
  end

  # From now on, etag and commit_hash are not None.
  raise "etag must have been retrieved from server" if etag.nil?
  raise "commit_hash must have been retrieved from server" if commit_hash.nil?
  raise "file location must have been retrieved from server" if url_to_download.nil?
  raise "expected_size must have been retrieved from server" if expected_size.nil?

  blob_path = File.join(storage_folder, "blobs", etag)
  pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)

  FileUtils.mkdir_p(File.dirname(blob_path))
  FileUtils.mkdir_p(File.dirname(pointer_path))

  # if passed revision is not identical to commit_hash
  # then revision has to be a branch name or tag name.
  # In that case store a ref.
  _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)

  if !force_download
    if File.exist?(pointer_path)
      return pointer_path
    end

    if File.exist?(blob_path)
      # we have the blob already, but not the pointer
      _create_symlink(blob_path, pointer_path, new_blob: false)
      return pointer_path
    end
  end

  # Prevent parallel downloads of the same file with a lock.
  # etag could be duplicated across repos,
  # lock_path = File.join(locks_dir, repo_folder_name(repo_id: repo_id, repo_type: repo_type), "#{etag}.lock")

  _download_to_tmp_and_move(
    incomplete_path: Pathname.new(blob_path + ".incomplete"),
    destination_path: Pathname.new(blob_path),
    url_to_download: url_to_download,
    proxies: proxies,
    headers: headers,
    expected_size: expected_size,
    filename: filename,
    force_download: force_download
  )

  _create_symlink(blob_path, pointer_path, new_blob: true)

  pointer_path
end
._http_user_agent(library_name: nil, library_version: nil, user_agent: nil) ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/transformers/hf_hub/utils/_headers.rb', line 94
# Format the user-agent string sent to the Hub: "<lib>/<version>; ruby/<x.y>".
# Falls back to "unknown/None" when no library name is given.
# NOTE(review): `user_agent` is accepted but currently unused.
def _http_user_agent(
  library_name: nil,
  library_version: nil,
  user_agent: nil
)
  base = library_name.nil? ? "unknown/None" : "#{library_name}/#{library_version}"
  "#{base}; ruby/#{RUBY_VERSION.to_f}"
end
._int_or_none(value) ⇒ Object
703 704 705 |
# File 'lib/transformers/hf_hub/file_download.rb', line 703
# Convert to Integer, preserving nil (e.g. a missing Content-Length header).
def _int_or_none(value)
  value.nil? ? nil : value.to_i
end
._is_true(value) ⇒ Object
8 9 10 11 12 13 |
# File 'lib/transformers/hf_hub/constants.rb', line 8
# Interpret an env-var string as a boolean: true when the upcased value is
# listed in ENV_VARS_TRUE_VALUES; nil (unset) is false.
def self._is_true(value)
  return false if value.nil?
  ENV_VARS_TRUE_VALUES.include?(value.upcase)
end
._normalize_etag(etag) ⇒ Object
173 174 175 176 177 178 |
# File 'lib/transformers/hf_hub/file_download.rb', line 173
# Normalize an HTTP ETag: strip one leading non-word char (e.g. the quote or
# the "W" weak-prefix start) and remove all double quotes. nil stays nil.
def _normalize_etag(etag)
  etag && etag.sub(/\A\W/, "").delete('"')
end
._raise_on_head_call_error(head_call_error, force_download, local_files_only) ⇒ Object
613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 |
# File 'lib/transformers/hf_hub/file_download.rb', line 613 def _raise_on_head_call_error(head_call_error, force_download, local_files_only) # No head call => we cannot force download. if force_download if local_files_only raise ArgumentError, "Cannot pass 'force_download: true' and 'local_files_only: true' at the same time." elsif head_call_error.is_a?(OfflineModeIsEnabled) raise ArgumentError, "Cannot pass 'force_download: true' when offline mode is enabled." else raise ArgumentError, "Force download failed due to the above error." end end # If we couldn't find an appropriate file on disk, raise an error. # If files cannot be found and local_files_only=True, # the models might've been found if local_files_only=False # Notify the user about that if local_files_only raise LocalEntryNotFoundError, "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" + " hf.co look-ups and downloads online, set 'local_files_only' to false." elsif head_call_error.is_a?(RepositoryNotFoundError) || head_call_error.is_a?(GatedRepoError) # Repo not found or gated => let's raise the actual error raise head_call_error else # Otherwise: most likely a connection issue or Hub downtime => let's warn the user raise LocalEntryNotFoundError, "An error happened while trying to locate the file on the Hub and we cannot find the requested files" + " in the local cache. Please check your connection and try again or make sure your Internet connection" + " is on." end end |
._request_wrapper(method, url, follow_relative_redirects: false, redirects: 0, **params) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/transformers/hf_hub/file_download.rb', line 60
# Perform an HTTP request, optionally following relative redirects (as
# allowed by RFC 7231) up to 10 hops. Raises via hf_raise_for_status on
# error responses. Reconstructed: the stripped Net::HTTP options hash local
# is restored as `options`.
def _request_wrapper(method, url, follow_relative_redirects: false, redirects: 0, **params)
  # Recursively follow relative redirects
  if follow_relative_redirects
    if redirects > 10
      raise "Too many redirects"
    end

    response = _request_wrapper(
      method,
      url,
      follow_relative_redirects: false,
      **params
    )

    # If redirection, we redirect only relative paths.
    # This is useful in case of a renamed repository.
    if response.is_a?(Net::HTTPRedirection)
      parsed_target = URI.parse(response["Location"])
      if netloc(parsed_target) == ""
        # This means it is a relative 'location' headers, as allowed by RFC 7231.
        # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
        # We want to follow this relative redirect !
        #
        # Highly inspired by `resolve_redirects` from requests library.
        # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
        next_url = URI.parse(url)
        next_url.path = parsed_target.path
        return _request_wrapper(method, next_url, follow_relative_redirects: true, redirects: redirects + 1, **params)
      end
    end
    return response
  end

  # Perform request and return if status_code is not in the retry list.
  uri = URI.parse(url)

  options = {use_ssl: true}
  if params[:timeout]
    options[:open_timeout] = params[:timeout]
    options[:read_timeout] = params[:timeout]
    options[:write_timeout] = params[:timeout]
  end
  # NOTE(review): `uri.path` drops any query string — confirm callers never
  # pass URLs with query parameters.
  response = Net::HTTP.start(uri.host, uri.port, **options) do |http|
    http.send_request(method, uri.path, nil, params[:headers])
  end
  response.uri ||= uri
  hf_raise_for_status(response)
  response
end
._validate_token_to_send(token, is_write_action) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/transformers/hf_hub/utils/_headers.rb', line 82
# Guard for write actions: raise ArgumentError when a write-access action is
# attempted without a token. Read actions always pass.
def _validate_token_to_send(token, is_write_action)
  return unless is_write_action && token.nil?

  raise ArgumentError,
    "Token is required (write-access action) but no token found. You need" +
    " to provide a token or be logged in to Hugging Face with" +
    " `huggingface-cli login` or `huggingface_hub.login`. See" +
    " https://huggingface.co/settings/tokens."
end
.build_hf_headers(token: nil, is_write_action: false, library_name: nil, library_version: nil, user_agent: nil, headers: nil) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/transformers/hf_hub/utils/_headers.rb', line 18 def build_hf_headers( token: nil, is_write_action: false, library_name: nil, library_version: nil, user_agent: nil, headers: nil ) # Get auth token to send token_to_send = get_token_to_send(token) _validate_token_to_send(token_to_send, is_write_action) # Combine headers hf_headers = { "user-agent" => _http_user_agent( library_name: library_name, library_version: library_version, user_agent: user_agent ) } if !token_to_send.nil? hf_headers["authorization"] = "Bearer #{token_to_send}" end if headers hf_headers.merge!(headers) end hf_headers end |
.display_progress(filename, width, size, expected_size) ⇒ Object
755 756 757 758 759 760 761 |
# File 'lib/transformers/hf_hub/file_download.rb', line 755
# Render a one-line text progress bar: "<filename> |████    |" sized to fit
# `width` columns. Reconstructed: the stripped locals are restored as
# `bar_width` and the filename interpolation in the output string.
def display_progress(filename, width, size, expected_size)
  # 3 columns are used by " |" and "|" around the bar.
  bar_width = width - (filename.length + 3)
  progress = size / expected_size.to_f
  done = (progress * bar_width).round
  not_done = bar_width - done
  "#{filename} |#{"█" * done}#{" " * not_done}|"
end
.get_hf_file_metadata(url, token: nil, proxies: nil, timeout: DEFAULT_REQUEST_TIMEOUT, library_name: nil, library_version: nil, user_agent: nil, headers: nil) ⇒ Object
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 |
# File 'lib/transformers/hf_hub/file_download.rb', line 483
# HEAD `url` and return an HfFileMetadata (commit hash, etag, final location,
# size) from the response headers. Reconstructed: the stripped def name is
# restored from the class method summary.
def get_hf_file_metadata(
  url,
  token: nil,
  proxies: nil,
  timeout: DEFAULT_REQUEST_TIMEOUT,
  library_name: nil,
  library_version: nil,
  user_agent: nil,
  headers: nil
)
  headers = build_hf_headers(
    token: token,
    library_name: library_name,
    library_version: library_version,
    user_agent: user_agent,
    headers: headers
  )
  headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file

  # Retrieve metadata
  r = _request_wrapper(
    "HEAD",
    url,
    headers: headers,
    allow_redirects: false,
    follow_relative_redirects: true,
    proxies: proxies,
    timeout: timeout
  )
  hf_raise_for_status(r)

  # Return
  HfFileMetadata.new(
    commit_hash: r[HUGGINGFACE_HEADER_X_REPO_COMMIT],
    # We favor a custom header indicating the etag of the linked resource, and
    # we fallback to the regular etag header.
    etag: _normalize_etag(r[HUGGINGFACE_HEADER_X_LINKED_ETAG] || r["etag"]),
    # Either from response headers (if redirected) or defaults to request url
    # Do not use directly `url`, as `_request_wrapper` might have followed relative
    # redirects.
    location: r["location"] || r.uri.to_s,
    size: _int_or_none(r[HUGGINGFACE_HEADER_X_LINKED_SIZE] || r["content-length"])
  )
end
.get_token_to_send(token) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/transformers/hf_hub/utils/_headers.rb', line 47 def get_token_to_send(token) # Case token is explicitly provided if token.is_a?(String) return token end # Case token is explicitly forbidden if token == false return nil end # Token is not provided: we get it from local cache cached_token = nil # get_token # Case token is explicitly required if token == true if cached_token.nil? raise LocalTokenNotFoundError, "Token is required (`token: true`), but no token found. You" + " need to provide a token or be logged in to Hugging Face with" + " `huggingface-cli login` or `huggingface_hub.login`. See" + " https://huggingface.co/settings/tokens." end return cached_token end # Case implicit use of the token is forbidden by env variable if HF_HUB_DISABLE_IMPLICIT_TOKEN return nil end # Otherwise: we use the cached token as the user has not explicitly forbidden it cached_token end |
.hf_hub_download(repo_id, filename, subfolder: nil, repo_type: nil, revision: nil, library_name: nil, library_version: nil, cache_dir: nil, local_dir: nil, local_dir_use_symlinks: "auto", user_agent: nil, force_download: false, force_filename: nil, proxies: nil, etag_timeout: DEFAULT_ETAG_TIMEOUT, resume_download: false, token: nil, local_files_only: false, legacy_cache_layout: false, endpoint: nil) ⇒ Object
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 |
# File 'lib/transformers/hf_hub/file_download.rb', line 217
# Public entry point: download a file from a Hub repo into the local cache
# (or return the cached path). Reconstructed: the stripped interpolation in
# the subfolder join is restored as `#{filename}`.
def hf_hub_download(
  repo_id,
  filename,
  subfolder: nil,
  repo_type: nil,
  revision: nil,
  library_name: nil,
  library_version: nil,
  cache_dir: nil,
  local_dir: nil,
  local_dir_use_symlinks: "auto",
  user_agent: nil,
  force_download: false,
  force_filename: nil,
  proxies: nil,
  etag_timeout: DEFAULT_ETAG_TIMEOUT,
  resume_download: false,
  token: nil,
  local_files_only: false,
  legacy_cache_layout: false,
  endpoint: nil
)
  if cache_dir.nil?
    cache_dir = HF_HUB_CACHE
  end
  if revision.nil?
    revision = DEFAULT_REVISION
  end

  if subfolder == ""
    subfolder = nil
  end
  if !subfolder.nil?
    # This is used to create a URL, and not a local path, hence the forward slash.
    filename = "#{subfolder}/#{filename}"
  end

  if repo_type.nil?
    repo_type = "model"
  end
  if !REPO_TYPES.include?(repo_type)
    raise ArgumentError, "Invalid repo type: #{repo_type}. Accepted repo types are: #{REPO_TYPES}"
  end

  headers =
    build_hf_headers(
      token: token,
      library_name: library_name,
      library_version: library_version,
      user_agent: user_agent
    )

  if !local_dir.nil?
    raise Todo
  else
    _hf_hub_download_to_cache_dir(
      # Destination
      cache_dir: cache_dir,
      # File info
      repo_id: repo_id,
      filename: filename,
      repo_type: repo_type,
      revision: revision,
      # HTTP info
      endpoint: endpoint,
      etag_timeout: etag_timeout,
      headers: headers,
      proxies: proxies,
      token: token,
      # Additional options
      local_files_only: local_files_only,
      force_download: force_download
    )
  end
end
.hf_hub_url(repo_id, filename, subfolder: nil, repo_type: nil, revision: nil, endpoint: nil) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/transformers/hf_hub/file_download.rb', line 24
# Build the resolve URL for a file in a Hub repo, prefixing dataset/space
# repo ids and escaping revision/filename. Reconstructed: the stripped
# interpolation in the subfolder join is restored as `#{filename}`.
def hf_hub_url(
  repo_id,
  filename,
  subfolder: nil,
  repo_type: nil,
  revision: nil,
  endpoint: nil
)
  if subfolder == ""
    subfolder = nil
  end
  if !subfolder.nil?
    filename = "#{subfolder}/#{filename}"
  end

  if !REPO_TYPES.include?(repo_type)
    raise ArgumentError, "Invalid repo type"
  end

  if REPO_TYPES_URL_PREFIXES.include?(repo_type)
    repo_id = REPO_TYPES_URL_PREFIXES[repo_type] + repo_id
  end

  if revision.nil?
    revision = DEFAULT_REVISION
  end
  # NOTE(review): CGI.escape also percent-encodes "/" inside `filename`
  # (including a joined subfolder) — confirm the Hub accepts %2F here.
  url =
    HUGGINGFACE_CO_URL_TEMPLATE %
    {repo_id: repo_id, revision: CGI.escape(revision), filename: CGI.escape(filename)}
  # Update endpoint if provided
  if !endpoint.nil? && url.start_with?(ENDPOINT)
    url = endpoint + url[ENDPOINT.length..]
  end
  url
end
.hf_raise_for_status(response, endpoint_name: nil) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/transformers/hf_hub/utils/_errors.rb', line 24
# Raise a typed HfHub error for a non-success HTTP response, dispatching on
# the Hub's X-Error-Code / X-Error-Message headers, falling back to
# HfHubHTTPError. Redirects are not treated as errors.
# Reconstructed: the stripped locals are restored as `error_message` and
# `message`; the 403 branch's broken "{response.code}" is fixed to a real
# interpolation `#{response.code}`.
def hf_raise_for_status(response, endpoint_name: nil)
  begin
    # Net::HTTPResponse#value raises for non-2xx codes.
    response.value unless response.is_a?(Net::HTTPRedirection)
  rescue => e
    error_code = response["X-Error-Code"]
    error_message = response["X-Error-Message"]

    if error_code == "RevisionNotFound"
      message = "#{response.code} Client Error." + "\n\n" + "Revision Not Found for url: #{response.uri}."
      raise RevisionNotFoundError.new(message, response)
    elsif error_code == "EntryNotFound"
      message = "#{response.code} Client Error." + "\n\n" + "Entry Not Found for url: #{response.uri}."
      raise EntryNotFoundError.new(message, response)
    elsif error_code == "GatedRepo"
      message = (
        "#{response.code} Client Error." + "\n\n" + "Cannot access gated repo for url #{response.uri}."
      )
      raise GatedRepoError.new(message, response)
    elsif error_message == "Access to this resource is disabled."
      message = (
        "#{response.code} Client Error." +
        "\n\n" +
        "Cannot access repository for url #{response.uri}." +
        "\n" +
        "Access to this resource is disabled."
      )
      raise DisabledRepoError.new(message, response)
    elsif error_code == "RepoNotFound"
      # 401 is misleading as it is returned for:
      #    - private and gated repos if user is not authenticated
      #    - missing repos
      # => for now, we process them as `RepoNotFound` anyway.
      # See https://gist.github.com/Wauplin/46c27ad266b15998ce56a6603796f0b9
      message = (
        "#{response.code} Client Error." +
        "\n\n" +
        "Repository Not Found for url: #{response.uri}." +
        "\nPlease make sure you specified the correct `repo_id` and" +
        " `repo_type`.\nIf you are trying to access a private or gated repo," +
        " make sure you are authenticated."
      )
      raise RepositoryNotFoundError.new(message, response)
    elsif response.code.to_i == 400
      message = (
        !endpoint_name.nil? ? "\n\nBad request for #{endpoint_name} endpoint:" : "\n\nBad request:"
      )
      raise BadRequestError.new(message, response)
    elsif response.code.to_i == 403
      message = (
        "\n\n#{response.code} Forbidden: #{error_message}." +
        "\nCannot access content at: #{response.uri}." +
        "\nIf you are trying to create or update content, " +
        "make sure you have a token with the `write` role."
      )
      raise HfHubHTTPError.new(message, response)
    end

    # Convert `HTTPError` into a `HfHubHTTPError` to display request information
    # as well (request id and/or server error message)
    raise HfHubHTTPError.new(e.to_s, response)
  end
end
.http_get(url, temp_file, proxies: nil, resume_size: 0, headers: nil, expected_size: nil, displayed_filename: nil, _nb_retries: 5) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/transformers/hf_hub/file_download.rb', line 111 def http_get( url, temp_file, proxies: nil, resume_size: 0, headers: nil, expected_size: nil, displayed_filename: nil, _nb_retries: 5 ) uri = URI.parse(url) if resume_size > 0 headers["range"] = "bytes=%d-" % [resume_size] end size = resume_size Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http| request = Net::HTTP::Get.new(uri) headers.each do |k, v| request[k] = v end http.request(request) do |response| case response when Net::HTTPSuccess if displayed_filename.nil? displayed_filename = url content_disposition = response["content-disposition"] if !content_disposition.nil? match = HEADER_FILENAME_PATTERN.match(content_disposition) if !match.nil? # Means file is on CDN displayed_filename = match["filename"] end end end stream = STDERR tty = stream.tty? width = tty ? stream.winsize[1] : 80 response.read_body do |chunk| temp_file.write(chunk) size += chunk.bytesize if tty stream.print "\r#{display_progress(displayed_filename, width, size, expected_size)}" end end if tty stream.puts else stream.puts display_progress(displayed_filename, width, size, expected_size) end else hf_raise_for_status(response) end end end end |
# File 'lib/transformers/hf_hub/file_download.rb', line 739
# Return the "host:port" portion of a URI ("" when both are absent, as with
# relative URIs) — mirrors Python urllib's netloc.
def netloc(uri)
  pieces = []
  pieces << uri.host if uri.host
  pieces << uri.port if uri.port
  pieces.join(":")
end
# File 'lib/transformers/hf_hub/file_download.rb', line 743
# Collect every ancestor Pathname of `path`, nearest first, stopping at the
# filesystem root (capped at 100 levels) — mirrors Python's Path.parents.
def parents(path)
  ancestors = []
  100.times do
    break if path == path.parent

    path = path.parent
    ancestors << path
  end
  ancestors
end
# File 'lib/transformers/hf_hub/file_download.rb', line 207
# Convert a repo id to a flat cache directory name, e.g.
# ("org/name", "model") -> "models--org--name".
def repo_folder_name(repo_id:, repo_type:)
  # remove all `/` occurrences to correctly convert repo to directory name
  (["#{repo_type}s"] + repo_id.split("/")).join(REPO_ID_SEPARATOR)
end
# File 'lib/transformers/hf_hub/file_download.rb', line 424
# Look up a file in the local cache without touching the network. Returns the
# cached file path, CACHED_NO_EXIST when the file is known to be absent from
# the remote repo, or nil when nothing usable is cached.
def try_to_load_from_cache(
  repo_id,
  filename,
  cache_dir: nil,
  revision: nil,
  repo_type: nil
)
  if revision.nil?
    revision = "main"
  end
  if repo_type.nil?
    repo_type = "model"
  end
  if !REPO_TYPES.include?(repo_type)
    raise ArgumentError, "Invalid repo type: #{repo_type}. Accepted repo types are: #{REPO_TYPES}"
  end
  if cache_dir.nil?
    cache_dir = HF_HUB_CACHE
  end

  object_id = repo_id.gsub("/", "--")
  repo_cache = File.join(cache_dir, "#{repo_type}s--#{object_id}")
  if !Dir.exist?(repo_cache)
    # No cache for this model
    return nil
  end

  refs_dir = File.join(repo_cache, "refs")
  snapshots_dir = File.join(repo_cache, "snapshots")
  no_exist_dir = File.join(repo_cache, ".no_exist")

  # Resolve refs (for instance to convert main to the associated commit sha)
  if Dir.exist?(refs_dir)
    revision_file = File.join(refs_dir, revision)
    if File.exist?(revision_file)
      revision = File.read(revision_file)
    end
  end

  # Check if file is cached as "no_exist"
  if File.exist?(File.join(no_exist_dir, revision, filename))
    return CACHED_NO_EXIST
  end

  # Check if revision folder exists
  if !Dir.exist?(snapshots_dir)
    return nil
  end
  cached_shas = Dir.glob("*", base: snapshots_dir)
  if !cached_shas.include?(revision)
    # No cache for this revision and we won't try to return a random revision
    return nil
  end

  # Check if file exists in cache
  cached_file = File.join(snapshots_dir, revision, filename)
  File.exist?(cached_file) ? cached_file : nil
end