Module: BlingFire

Defined in:
lib/blingfire.rb,
lib/blingfire/ffi.rb,
lib/blingfire/model.rb,
lib/blingfire/version.rb

Defined Under Namespace

Modules: FFI

Classes: Error, Model

Constant Summary

VERSION =
"0.2.0"

Class Attribute Summary

Class Method Summary

Class Attribute Details

.ffi_lib ⇒ Object

Returns the value of attribute ffi_lib.



12
13
14
# File 'lib/blingfire.rb', line 12

# Reader for the +@ffi_lib+ instance variable.
#
# NOTE(review): presumably holds the native library location used by the FFI
# bindings — confirm against lib/blingfire/ffi.rb.
#
# @return [Object] the current value of +@ffi_lib+ (nil if it was never assigned)
def ffi_lib
  @ffi_lib
end

Class Method Details

.change_settings_dummy_prefix(model, value) ⇒ Object

Raises:

- (Error) — if the native SetNoDummyPrefix call returns a status other than 1



138
139
140
141
142
# File 'lib/blingfire.rb', line 138

# Toggles the dummy-prefix setting on +model+ through the native
# +SetNoDummyPrefix+ call.
#
# @param model [Object] model handle forwarded to the native call
# @param value [Boolean] truthy sends 0, falsy sends 1 (the native flag is the inverse)
# @return [nil]
# @raise [Error] when the native call returns anything other than 1
def change_settings_dummy_prefix(model, value)
  # The native setting is "no dummy prefix", so the flag is the opposite of +value+.
  no_dummy_prefix = value ? 0 : 1
  status = FFI.SetNoDummyPrefix(model, no_dummy_prefix)
  return if status == 1

  raise Error, "Bad status: #{status}"
end

.free_model(model) ⇒ Object



125
126
127
# File 'lib/blingfire.rb', line 125

# Passes +model+ to the native +FreeModel+ function.
#
# NOTE(review): presumably +model+ is a native model handle that must not be
# used after this call — confirm against the FFI bindings.
#
# @param model [Object] value forwarded unchanged to +FFI.FreeModel+
# @return [Object] whatever +FFI.FreeModel+ returns
def free_model(model)
  FFI.FreeModel(model)
end

.ids_to_text(model, ids, skip_special_tokens: true, output_buffer_size: nil) ⇒ Object



116
117
118
119
120
121
122
123
# File 'lib/blingfire.rb', line 116

# Converts token ids back into text through the native +IdsToText+ call.
#
# @param model [Object] model handle forwarded to the native call
# @param ids [Array<Integer>] token ids; packed as native ints before the call
# @param skip_special_tokens [Boolean] sent to the native call as 1 (truthy) or 0
# @param output_buffer_size [Integer, nil] output buffer size in bytes;
#   defaults to 32 bytes per id
# @return [String] +encode_utf8+ applied to the bytes written by the native call
# @raise [Error] presumably raised by +check_status+ on a bad status — confirm
def ids_to_text(model, ids, skip_special_tokens: true, output_buffer_size: nil)
  output_buffer_size ||= ids.size * 32
  c_ids = Fiddle::Pointer[ids.pack("i*")]
  # Fiddle releases malloc'd memory only when a free function is attached;
  # without RUBY_FREE every call leaked its output buffer.
  out = Fiddle::Pointer.malloc(output_buffer_size, Fiddle::RUBY_FREE)
  out_size = FFI.IdsToText(model, c_ids, ids.size, out, output_buffer_size, skip_special_tokens ? 1 : 0)
  check_status out_size, out
  # NOTE(review): the last byte is dropped, presumably the NUL terminator
  # counted in out_size — confirm against the native API.
  encode_utf8(out.to_str(out_size - 1))
end

.lib_version ⇒ Object



37
38
39
# File 'lib/blingfire.rb', line 37

# Reports the version of the native tokenizer library.
#
# @return [Object] whatever +FFI.GetBlingFireTokVersion+ returns
#   (presumably an Integer — confirm against the FFI bindings)
def lib_version
  FFI.GetBlingFireTokVersion
end

.load_model(path, **options) ⇒ Object



41
42
43
# File 'lib/blingfire.rb', line 41

# Convenience constructor: builds a +Model+ for +path+.
#
# @param path [Object] forwarded as the first argument to +Model.new+
#   (presumably a filesystem path to a model file — confirm)
# @param options [Hash] keyword options forwarded unchanged to +Model.new+
# @return [Model] the newly created model
def load_model(path, **options)
  Model.new(path, **options)
end

.normalize_spaces(text) ⇒ Object



129
130
131
132
133
134
135
136
# File 'lib/blingfire.rb', line 129

# Runs the native +NormalizeSpaces+ call over +text+, passing U+0020 as the
# space code point argument.
#
# @param text [String] input text; converted with +encode_utf8+ when it is not
#   already UTF-8 (the caller's string is not modified)
# @return [String] +encode_utf8+ applied to the bytes written by the native call
# @raise [Error] presumably raised by +check_status+ on a bad status — confirm
def normalize_spaces(text)
  u_space = 0x20
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
  # 1.5x the input size, at least 20 bytes. Integer arithmetic yields the same
  # size the truncated Float did, without handing a Float to malloc.
  capacity = [text.bytesize * 3 / 2, 20].max
  # Fiddle releases malloc'd memory only when a free function is attached;
  # without RUBY_FREE every call leaked its output buffer.
  out = Fiddle::Pointer.malloc(capacity, Fiddle::RUBY_FREE)
  out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
  check_status out_size, out
  encode_utf8(out.to_str(out_size))
end

.text_to_ids(model, text, max_len = nil, unk_id = 0) ⇒ Object



93
94
95
96
97
98
99
# File 'lib/blingfire.rb', line 93

# Tokenizes +text+ into ids through the native +TextToIds+ call.
#
# @param model [Object] model handle forwarded to the native call
# @param text [String] input text; converted with +encode_utf8+ when it is not
#   already UTF-8 (the caller's string is not modified)
# @param max_len [Integer, nil] when given, exactly this many ids are returned;
#   when nil, as many ids as the native call reports
# @param unk_id [Integer] id forwarded to the native call for unknown tokens
# @return [Array<Integer>] the ids read back from the native buffer
# @raise [Error] presumably raised by +check_status+ on a bad status — confirm
def text_to_ids(model, text, max_len = nil, unk_id = 0)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8

  # Allocates room for +capacity+ ids and runs the native tokenizer once.
  # - RUBY_FREE lets the GC release the buffer (a bare malloc leaked it).
  # - The native length argument is a count of ids; the original passed the
  #   byte size (4x too large), so native code could write past the buffer.
  tokenize = lambda do |capacity|
    buffer = Fiddle::Pointer.malloc(capacity * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
    [buffer, FFI.TextToIds(model, text, text.bytesize, buffer, capacity, unk_id)]
  end

  capacity = max_len || text.size
  ids, out_size = tokenize.call(capacity)
  # Without max_len the character count is only a first guess; if the native
  # call reports more ids than fit, run it again with exactly that much room.
  ids, out_size = tokenize.call(out_size) if max_len.nil? && out_size > capacity

  check_status out_size, ids
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
end

.text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0) ⇒ Object



101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/blingfire.rb', line 101

# Tokenizes +text+ into ids plus start/end offsets through the native
# +TextToIdsWithOffsets+ call.
#
# @param model [Object] model handle forwarded to the native call
# @param text [String] input text; converted with +encode_utf8+ when it is not
#   already UTF-8 (the caller's string is not modified)
# @param max_len [Integer, nil] when given, exactly this many ids are returned;
#   when nil, as many ids as the native call reports
# @param unk_id [Integer] id forwarded to the native call for unknown tokens
# @return [Array] the ids array followed by the elements returned by
#   +unpack_offsets+ (presumably start offsets and end offsets — confirm)
# @raise [Error] presumably raised by +check_status+ on a bad status — confirm
def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8

  # Allocates three buffers of +capacity+ ints and runs the native call once.
  # - RUBY_FREE lets the GC release the buffers (a bare malloc leaked them).
  # - The native length argument is a count of ids; the original passed the
  #   byte size of the id buffer (4x too large), so native code could write
  #   past it. The offset buffers were 4x over-allocated for the same reason.
  tokenize = lambda do |capacity|
    ids_buf, starts_buf, ends_buf = Array.new(3) do
      Fiddle::Pointer.malloc(capacity * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
    end
    count = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids_buf, starts_buf, ends_buf, capacity, unk_id)
    [count, ids_buf, starts_buf, ends_buf]
  end

  capacity = max_len || text.size
  out_size, ids, start_offsets, end_offsets = tokenize.call(capacity)
  # Without max_len the character count is only a first guess; if the native
  # call reports more ids than fit, run it again with exactly that much room.
  if max_len.nil? && out_size > capacity
    out_size, ids, start_offsets, end_offsets = tokenize.call(out_size)
  end

  check_status out_size, ids

  result = ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
  [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
end

.text_to_sentences(text) ⇒ Object



69
70
71
72
73
# File 'lib/blingfire.rb', line 69

# Runs the native +TextToSentences+ call over +text+ via the shared +text_to+
# helper, with "\n" passed as the separator argument.
#
# @param text [String] input text
# @return [Object] whatever +text_to+ returns (presumably an Array of
#   sentence strings — confirm)
def text_to_sentences(text)
  text_to(text, "\n") { |input, buffer| FFI.TextToSentences(input, input.bytesize, buffer, buffer.size) }
end

.text_to_sentences_with_model(model, text) ⇒ Object



75
76
77
78
79
# File 'lib/blingfire.rb', line 75

# Runs the native +TextToSentencesWithModel+ call over +text+ via the shared
# +text_to+ helper, with "\n" passed as the separator argument.
#
# @param model [Object] model handle passed as the last native argument
# @param text [String] input text
# @return [Object] whatever +text_to+ returns (presumably an Array of
#   sentence strings — confirm)
def text_to_sentences_with_model(model, text)
  text_to(text, "\n") do |input, buffer|
    byte_count = input.bytesize
    FFI.TextToSentencesWithModel(input, byte_count, buffer, buffer.size, model)
  end
end

.text_to_sentences_with_offsets(text) ⇒ Object



81
82
83
84
85
# File 'lib/blingfire.rb', line 81

# Runs the native +TextToSentencesWithOffsets+ call over +text+ via the shared
# +text_to_with_offsets+ helper, with "\n" passed as the separator argument.
#
# @param text [String] input text
# @return [Object] whatever +text_to_with_offsets+ returns (presumably
#   sentences together with their start and end offsets — confirm)
def text_to_sentences_with_offsets(text)
  text_to_with_offsets(text, "\n") do |input, buffer, starts, ends|
    byte_count = input.bytesize
    FFI.TextToSentencesWithOffsets(input, byte_count, buffer, starts, ends, buffer.size)
  end
end

.text_to_sentences_with_offsets_with_model(model, text) ⇒ Object



87
88
89
90
91
# File 'lib/blingfire.rb', line 87

# Runs the native +TextToSentencesWithOffsetsWithModel+ call over +text+ via
# the shared +text_to_with_offsets+ helper, with "\n" passed as the separator
# argument.
#
# @param model [Object] model handle passed as the last native argument
# @param text [String] input text
# @return [Object] whatever +text_to_with_offsets+ returns (presumably
#   sentences together with their start and end offsets — confirm)
def text_to_sentences_with_offsets_with_model(model, text)
  text_to_with_offsets(text, "\n") do |input, buffer, starts, ends|
    byte_count = input.bytesize
    FFI.TextToSentencesWithOffsetsWithModel(input, byte_count, buffer, starts, ends, buffer.size, model)
  end
end

.text_to_words(text) ⇒ Object



45
46
47
48
49
# File 'lib/blingfire.rb', line 45

# Runs the native +TextToWords+ call over +text+ via the shared +text_to+
# helper, with a single space passed as the separator argument.
#
# @param text [String] input text
# @return [Object] whatever +text_to+ returns (presumably an Array of word
#   strings — confirm)
def text_to_words(text)
  text_to(text, " ") { |input, buffer| FFI.TextToWords(input, input.bytesize, buffer, buffer.size) }
end

.text_to_words_with_model(model, text) ⇒ Object



51
52
53
54
55
# File 'lib/blingfire.rb', line 51

# Runs the native +TextToWordsWithModel+ call over +text+ via the shared
# +text_to+ helper, with a single space passed as the separator argument.
#
# @param model [Object] model handle passed as the last native argument
# @param text [String] input text
# @return [Object] whatever +text_to+ returns (presumably an Array of word
#   strings — confirm)
def text_to_words_with_model(model, text)
  text_to(text, " ") do |input, buffer|
    byte_count = input.bytesize
    FFI.TextToWordsWithModel(input, byte_count, buffer, buffer.size, model)
  end
end

.text_to_words_with_offsets(text) ⇒ Object



57
58
59
60
61
# File 'lib/blingfire.rb', line 57

# Runs the native +TextToWordsWithOffsets+ call over +text+ via the shared
# +text_to_with_offsets+ helper, with a single space passed as the separator
# argument.
#
# @param text [String] input text
# @return [Object] whatever +text_to_with_offsets+ returns (presumably words
#   together with their start and end offsets — confirm)
def text_to_words_with_offsets(text)
  text_to_with_offsets(text, " ") do |input, buffer, starts, ends|
    byte_count = input.bytesize
    FFI.TextToWordsWithOffsets(input, byte_count, buffer, starts, ends, buffer.size)
  end
end

.text_to_words_with_offsets_with_model(model, text) ⇒ Object



63
64
65
66
67
# File 'lib/blingfire.rb', line 63

# Runs the native +TextToWordsWithOffsetsWithModel+ call over +text+ via the
# shared +text_to_with_offsets+ helper, with a single space passed as the
# separator argument.
#
# @param model [Object] model handle passed as the last native argument
# @param text [String] input text
# @return [Object] whatever +text_to_with_offsets+ returns (presumably words
#   together with their start and end offsets — confirm)
def text_to_words_with_offsets_with_model(model, text)
  text_to_with_offsets(text, " ") do |input, buffer, starts, ends|
    byte_count = input.bytesize
    FFI.TextToWordsWithOffsetsWithModel(input, byte_count, buffer, starts, ends, buffer.size, model)
  end
end