Class: Transformers::PreTrainedTokenizerBase
- Inherits: Object
- Extended by: ClassAttribute
- Includes: SpecialTokensMixin
- Defined in: lib/transformers/tokenization_utils_base.rb
Direct Known Subclasses
Constant Summary
Constants included from SpecialTokensMixin
SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES
Instance Attribute Summary
- #init_kwargs ⇒ Object (readonly): Returns the value of attribute init_kwargs.
- #model_max_length ⇒ Object (readonly): Returns the value of attribute model_max_length.
Class Method Summary
- ._from_pretrained(resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, token: nil, cache_dir: nil, local_files_only: false, _commit_hash: nil, _is_local: false, trust_remote_code: false, **kwargs) ⇒ Object
- .from_pretrained(pretrained_model_name_or_path, *init_inputs, cache_dir: nil, force_download: false, local_files_only: false, token: nil, revision: "main", trust_remote_code: false, **kwargs) ⇒ Object
Instance Method Summary
- #_eventual_warn_about_too_long_sequence(ids, max_length, verbose) ⇒ Object
- #call(text, text_pair: nil, text_target: nil, text_pair_target: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, stride: 0, is_split_into_words: false, pad_to_multiple_of: nil, return_tensors: nil, return_token_type_ids: nil, return_attention_mask: nil, return_overflowing_tokens: false, return_special_tokens_mask: false, return_offsets_mapping: false, return_length: false, verbose: true, **kwargs) ⇒ Object
- #initialize(**kwargs) ⇒ PreTrainedTokenizerBase (constructor): A new instance of PreTrainedTokenizerBase.
Methods included from ClassAttribute
Methods included from SpecialTokensMixin
#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id
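The SpecialTokensMixin accessors return the integer ids of the tokenizer's special tokens (or nil when a token is not set). A minimal sketch, assuming a tokenizer loaded through Transformers::AutoTokenizer and the bert-base-uncased checkpoint; both the checkpoint name and the example id values are illustrative, not requirements of this class:

  # Illustrative only: checkpoint name and typical BERT ids are assumptions.
  require "transformers"

  tokenizer = Transformers::AutoTokenizer.from_pretrained("bert-base-uncased")
  tokenizer.cls_token_id       # => 101 for a BERT-style vocabulary
  tokenizer.sep_token_id       # => 102
  tokenizer.pad_token_id       # => 0
  tokenizer.special_tokens_map # => hash mapping attribute names to token strings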
Constructor Details
#initialize(**kwargs) ⇒ PreTrainedTokenizerBase
Returns a new instance of PreTrainedTokenizerBase.
# File 'lib/transformers/tokenization_utils_base.rb', line 210

def initialize(**kwargs)
  # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
  @init_inputs = []
  @init_kwargs = kwargs.dup # copy.deepcopy(kwargs)
  @name_or_path = kwargs.delete(:name_or_path) { "" }
  @processor_class = kwargs.delete(:processor_class)

  # For backward compatibility we fallback to set model_max_length from max_len if provided
  model_max_length = kwargs.delete(:model_max_length) { kwargs.delete(:max_len) }
  @model_max_length = !model_max_length.nil? ? model_max_length : VERY_LARGE_INTEGER

  # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
  # is changed.
  @padding_side = kwargs.delete(:padding_side) { self.class.padding_side }
  if !["right", "left"].include?(@padding_side)
    raise ArgumentError, "Padding side should be selected between 'right' and 'left', current value: #{@padding_side}"
  end

  @truncation_side = kwargs.delete(:truncation_side) { self.class.truncation_side }
  if !["right", "left"].include?(@truncation_side)
    raise ArgumentError, "Truncation side should be selected between 'right' and 'left', current value: #{@truncation_side}"
  end

  @model_input_names = kwargs.delete(:model_input_names) { self.class.model_input_names }

  # By default, cleaning tokenization spaces for both fast and slow tokenizers
  @clean_up_tokenization_spaces = kwargs.delete(:clean_up_tokenization_spaces) { true }

  # By default, do not split special tokens for both fast and slow tokenizers
  @split_special_tokens = kwargs.delete(:split_special_tokens) { false }

  @deprecation_warnings = {}
  @in_target_context_manager = false

  # Stores a Jinja template that formats chat histories into tokenizable strings
  @chat_template = kwargs.delete(:chat_template)
  if @chat_template.is_a?(Array)
    # Chat templates are stored as lists of dicts with fixed key names,
    # we reconstruct that into a single dict while loading them.
    @chat_template = @chat_template.to_h { |template| [template["name"], template["template"]] }
  end

  super
end
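The constructor is usually reached indirectly through from_pretrained, which merges any extra keyword arguments into the initialization kwargs, so options such as model_max_length, padding_side, and truncation_side can be overridden at load time. A minimal sketch, assuming an AutoTokenizer load of an illustrative checkpoint:

  # Sketch: overriding constructor kwargs at load time (checkpoint name is illustrative).
  tokenizer = Transformers::AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    model_max_length: 128,   # otherwise falls back to the checkpoint value or VERY_LARGE_INTEGER
    padding_side: "left",    # must be "right" or "left", else ArgumentError is raised
    truncation_side: "right"
  )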
Instance Attribute Details
#init_kwargs ⇒ Object (readonly)
Returns the value of attribute init_kwargs.
# File 'lib/transformers/tokenization_utils_base.rb', line 208

def init_kwargs
  @init_kwargs
end
#model_max_length ⇒ Object (readonly)
Returns the value of attribute model_max_length.
# File 'lib/transformers/tokenization_utils_base.rb', line 208

def model_max_length
  @model_max_length
end
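Because model_max_length defaults to VERY_LARGE_INTEGER when a checkpoint declares no limit, it is worth consulting before relying on it to truncate. A small sketch, assuming tokenizer is any previously loaded tokenizer instance:

  # Sketch: consult model_max_length before encoding a long document.
  long_text = "a very long document " * 1000
  limit = tokenizer.model_max_length
  enc = tokenizer.(long_text, truncation: true, max_length: [limit, 512].min)
  enc["input_ids"].length  # stays within the requested max_length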
Class Method Details
._from_pretrained(resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, token: nil, cache_dir: nil, local_files_only: false, _commit_hash: nil, _is_local: false, trust_remote_code: false, **kwargs) ⇒ Object
# File 'lib/transformers/tokenization_utils_base.rb', line 813

def _from_pretrained(
  resolved_vocab_files,
  pretrained_model_name_or_path,
  init_configuration,
  *init_inputs,
  token: nil,
  cache_dir: nil,
  local_files_only: false,
  _commit_hash: nil,
  _is_local: false,
  trust_remote_code: false,
  **kwargs
)
  # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
  # file or if `from_slow` is set to True.
  from_slow = kwargs.delete(:from_slow) { false }
  has_tokenizer_file = !resolved_vocab_files[:tokenizer_file].nil?
  if (from_slow || !has_tokenizer_file) && !slow_tokenizer_class.nil?
    slow_tokenizer = slow_tokenizer_class._from_pretrained(
      Copy.deepcopy(resolved_vocab_files),
      pretrained_model_name_or_path,
      Copy.deepcopy(init_configuration),
      *init_inputs,
      token: token,
      cache_dir: cache_dir,
      local_files_only: local_files_only,
      _commit_hash: _commit_hash,
      **Copy.deepcopy(kwargs)
    )
  else
    slow_tokenizer = nil
  end

  # Prepare tokenizer initialization kwargs
  # Did we saved some inputs and kwargs to reload ?
  tokenizer_config_file = resolved_vocab_files.delete(:tokenizer_config_file)
  if !tokenizer_config_file.nil?
    init_kwargs = JSON.load_file(tokenizer_config_file).transform_keys(&:to_sym)
    # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
    config_tokenizer_class = init_kwargs[:tokenizer_class]
    init_kwargs.delete(:tokenizer_class)
    if !has_tokenizer_file
      init_kwargs.delete(:tokenizer_file)
    end
    saved_init_inputs = init_kwargs.delete(:init_inputs) { [] }
    if init_inputs.empty?
      init_inputs = saved_init_inputs
    end
  else
    config_tokenizer_class = nil
    init_kwargs = init_configuration
  end

  if config_tokenizer_class.nil?
    config = AutoConfig.from_pretrained(
      pretrained_model_name_or_path,
      token: token,
      cache_dir: cache_dir,
      local_files_only: local_files_only,
      trust_remote_code: trust_remote_code,
      _commit_hash: _commit_hash,
    )
    config_tokenizer_class = config.tokenizer_class
    if config_tokenizer_class.nil?
      # Third attempt. If we have not yet found the original type of the tokenizer,
      # we are loading we see if we can infer it from the type of the configuration file
      if config.class.model_type
        model_type = config.class.model_type
      else
        # Fallback: use pattern matching on the string.
        model_type = nil
        TOKENIZER_MAPPING_NAMES.each_key do |pattern|
          if pretrained_model_name_or_path.to_s.include?(pattern)
            model_type = pattern
            break
          end
        end
      end

      if !model_type.nil?
        config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.fetch(model_type, [nil, nil])
        if config_tokenizer_class.nil?
          config_tokenizer_class = config_tokenizer_class_fast
        end
      end
    end
  end

  if !config_tokenizer_class.nil?
    if name.split("::").last.gsub("Fast", "") != config_tokenizer_class.gsub("Fast", "")
      raise Todo
    end
  end

  # Update with newly provided kwargs
  init_kwargs.merge!(kwargs)

  # Merge resolved_vocab_files arguments in init_kwargs.
  _added_tokens_file = resolved_vocab_files.delete(:added_tokens_file)
  _special_tokens_map_file = resolved_vocab_files.delete(:special_tokens_map_file)
  resolved_vocab_files.each do |args_name, file_path|
    if !init_kwargs.include?(args_name)
      init_kwargs[args_name] = file_path
    end
  end
  _tokenizer_file = resolved_vocab_files.delete(:tokenizer_file)

  if !slow_tokenizer.nil?
    init_kwargs[:__slow_tokenizer] = slow_tokenizer
  end
  init_kwargs[:name_or_path] = pretrained_model_name_or_path

  # Instantiate the tokenizer.
  tokenizer = new(*init_inputs, **init_kwargs)

  tokenizer
end
.from_pretrained(pretrained_model_name_or_path, *init_inputs, cache_dir: nil, force_download: false, local_files_only: false, token: nil, revision: "main", trust_remote_code: false, **kwargs) ⇒ Object
# File 'lib/transformers/tokenization_utils_base.rb', line 668

def from_pretrained(
  pretrained_model_name_or_path,
  *init_inputs,
  cache_dir: nil,
  force_download: false,
  local_files_only: false,
  token: nil,
  revision: "main",
  trust_remote_code: false,
  **kwargs
)
  resume_download = kwargs.delete(:resume_download) { false }
  proxies = kwargs.delete(:proxies)
  subfolder = kwargs.delete(:subfolder)
  from_pipeline = kwargs.delete(:_from_pipeline)
  from_auto_class = kwargs.delete(:_from_auto) { false }
  commit_hash = kwargs.delete(:_commit_hash)

  user_agent = {file_type: "tokenizer", from_auto_class: from_auto_class, is_fast: name.include?("Fast")}
  if !from_pipeline.nil?
    user_agent[:using_pipeline] = from_pipeline
  end

  if Utils::Hub.is_offline_mode && !local_files_only
    Transformers.logger.info("Offline mode: forcing local_files_only: true")
    local_files_only = true
  end

  pretrained_model_name_or_path = pretrained_model_name_or_path.to_s
  vocab_files = {}
  init_configuration = {}

  is_local = Dir.exist?(pretrained_model_name_or_path)
  single_file_id = nil
  if File.exist?(pretrained_model_name_or_path)
    raise Todo
  end

  # At this point pretrained_model_name_or_path is either a directory or a model identifier name
  additional_files_names = {
    added_tokens_file: ADDED_TOKENS_FILE, # kept only for legacy
    special_tokens_map_file: SPECIAL_TOKENS_MAP_FILE, # kept only for legacy
    tokenizer_config_file: TOKENIZER_CONFIG_FILE,
    # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
    tokenizer_file: FULL_TOKENIZER_FILE
  }
  vocab_files = vocab_files_names.merge(additional_files_names)
  if vocab_files[:tokenizer_file]
    # Try to get the tokenizer config to see if there are versioned tokenizer files.
    fast_tokenizer_file = FULL_TOKENIZER_FILE
    resolved_config_file = Utils::Hub.cached_file(
      pretrained_model_name_or_path,
      TOKENIZER_CONFIG_FILE,
      cache_dir: cache_dir,
      force_download: force_download,
      resume_download: resume_download,
      proxies: proxies,
      token: token,
      revision: revision,
      local_files_only: local_files_only,
      subfolder: subfolder,
      user_agent: user_agent,
      _raise_exceptions_for_gated_repo: false,
      _raise_exceptions_for_missing_entries: false,
      _raise_exceptions_for_connection_errors: false,
      _commit_hash: commit_hash
    )
    commit_hash = Utils::Hub.extract_commit_hash(resolved_config_file, commit_hash)
    if !resolved_config_file.nil?
      tokenizer_config = JSON.load_file(resolved_config_file)
      if tokenizer_config["fast_tokenizer_files"]
        fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
      end
    end
    vocab_files[:tokenizer_file] = fast_tokenizer_file
  end

  # Get files from url, cache, or disk depending on the case
  resolved_vocab_files = {}
  unresolved_files = []
  vocab_files.each do |file_id, file_path|
    if file_path.nil?
      resolved_vocab_files[file_id] = nil
    elsif single_file_id == file_id
      if File.exist?(file_path)
        resolved_vocab_files[file_id] = file_path
      else
        raise Todo
      end
    else
      resolved_vocab_files[file_id] = Utils::Hub.cached_file(
        pretrained_model_name_or_path,
        file_path,
        cache_dir: cache_dir,
        force_download: force_download,
        proxies: proxies,
        resume_download: resume_download,
        local_files_only: local_files_only,
        token: token,
        user_agent: user_agent,
        revision: revision,
        subfolder: subfolder,
        _raise_exceptions_for_gated_repo: false,
        _raise_exceptions_for_missing_entries: false,
        _raise_exceptions_for_connection_errors: false,
        _commit_hash: commit_hash,
      )
      commit_hash = Utils::Hub.extract_commit_hash(resolved_vocab_files[file_id], commit_hash)
    end
  end

  # not used?
  if unresolved_files.length > 0
    raise Todo
  end

  vocab_files.each do |file_id, file_path|
    if !resolved_vocab_files.include?(file_id)
      next
    end

    if is_local
      Transformers.logger.info("loading file #{file_path}")
    else
      Transformers.logger.info("loading file #{file_path} from cache at #{resolved_vocab_files[file_id] || "nil"}")
    end
  end

  _from_pretrained(
    resolved_vocab_files,
    pretrained_model_name_or_path,
    init_configuration,
    *init_inputs,
    token: token,
    cache_dir: cache_dir,
    local_files_only: local_files_only,
    _commit_hash: commit_hash,
    _is_local: is_local,
    trust_remote_code: trust_remote_code,
    **kwargs
  )
end
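In practice from_pretrained is called on a concrete tokenizer class or via Transformers::AutoTokenizer rather than on this base class directly. A minimal sketch, assuming the illustrative bert-base-uncased checkpoint is reachable on the Hugging Face Hub or already cached:

  # Sketch: loading a tokenizer by model id (checkpoint name is illustrative).
  require "transformers"

  tokenizer = Transformers::AutoTokenizer.from_pretrained("bert-base-uncased")

  # Reuse only files that are already in the local cache
  cached = Transformers::AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    local_files_only: true
  )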
Instance Method Details
#_eventual_warn_about_too_long_sequence(ids, max_length, verbose) ⇒ Object
# File 'lib/transformers/tokenization_utils_base.rb', line 255

def _eventual_warn_about_too_long_sequence(ids, max_length, verbose)
  if max_length.nil? && ids.length > @model_max_length && verbose
    raise Todo
  end
end
#call(text, text_pair: nil, text_target: nil, text_pair_target: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, stride: 0, is_split_into_words: false, pad_to_multiple_of: nil, return_tensors: nil, return_token_type_ids: nil, return_attention_mask: nil, return_overflowing_tokens: false, return_special_tokens_mask: false, return_offsets_mapping: false, return_length: false, verbose: true, **kwargs) ⇒ Object
# File 'lib/transformers/tokenization_utils_base.rb', line 261

def call(
  text,
  text_pair: nil,
  text_target: nil,
  text_pair_target: nil,
  add_special_tokens: true,
  padding: false,
  truncation: nil,
  max_length: nil,
  stride: 0,
  is_split_into_words: false,
  pad_to_multiple_of: nil,
  return_tensors: nil,
  return_token_type_ids: nil,
  return_attention_mask: nil,
  return_overflowing_tokens: false,
  return_special_tokens_mask: false,
  return_offsets_mapping: false,
  return_length: false,
  verbose: true,
  **kwargs
)
  # To avoid duplicating
  all_kwargs = {
    add_special_tokens: add_special_tokens,
    padding: padding,
    truncation: truncation,
    max_length: max_length,
    stride: stride,
    is_split_into_words: is_split_into_words,
    pad_to_multiple_of: pad_to_multiple_of,
    return_tensors: return_tensors,
    return_token_type_ids: return_token_type_ids,
    return_attention_mask: return_attention_mask,
    return_overflowing_tokens: return_overflowing_tokens,
    return_special_tokens_mask: return_special_tokens_mask,
    return_offsets_mapping: return_offsets_mapping,
    return_length: return_length,
    verbose: verbose
  }
  all_kwargs.merge!(kwargs)
  if text.nil? && text_target.nil?
    raise ArgumentError, "You need to specify either `text` or `text_target`."
  end
  if !text.nil?
    # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
    # input mode in this case.
    if !@in_target_context_manager
      _switch_to_input_mode
    end
    encodings = _call_one(text: text, text_pair: text_pair, **all_kwargs)
  end
  if !text_target.nil?
    _switch_to_target_mode
    target_encodings = _call_one(text: text_target, text_pair: text_pair_target, **all_kwargs)
  end
  # Leave back tokenizer in input mode
  _switch_to_input_mode

  if text_target.nil?
    encodings
  elsif text.nil?
    target_encodings
  else
    encodings["labels"] = target_encodings["input_ids"]
    encodings
  end
end
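Because this is the Ruby call method, a tokenizer instance can be invoked with the .() shorthand on a single string, an array of strings, or sentence pairs. A usage sketch, assuming a BERT-style tokenizer loaded beforehand; the concrete ids and the "pt" (Torch) tensor type are illustrative assumptions:

  # Sketch: single text, batched text, and sentence pairs.
  enc = tokenizer.("Hello world")
  enc["input_ids"]       # e.g. [101, 7592, 2088, 102] for bert-base-uncased
  enc["attention_mask"]  # e.g. [1, 1, 1, 1]

  # Batch with padding and truncation, returning Torch tensors
  batch = tokenizer.(
    ["first sentence", "a much longer second sentence"],
    padding: true,
    truncation: true,
    max_length: 32,
    return_tensors: "pt"
  )

  # Sentence pairs (e.g. for NLI-style models)
  pair = tokenizer.("premise text", text_pair: "hypothesis text")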