Class: Rarff::Relation
- Inherits:
-
Object
- Object
- Rarff::Relation
- Defined in:
- lib/rarff.rb
Instance Attribute Summary collapse
-
#attributes ⇒ Object
Returns the value of attribute attributes.
-
#instances ⇒ Object
Returns the value of attribute instances.
-
#name ⇒ Object
Returns the value of attribute name.
Instance Method Summary collapse
- #create_attributes(attr_parse = false) ⇒ Object
- #expand_sparse(str) ⇒ Object
-
#initialize(name = '') ⇒ Relation
constructor
A new instance of Relation.
- #parse(str) ⇒ Object
-
#set_string_attributes_to_nominal(column_indices = nil) ⇒ Object
Converts all String-type attributes into nominal attributes, which are more useful in WEKA because more techniques can handle nominal attributes than string attributes.
- #to_arff(sparse = false) ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(name = '') ⇒ Relation
Returns a new instance of Relation.
129 130 131 132 133 |
# File 'lib/rarff.rb', line 129
# Sets up an empty relation: remembers the given name and starts with no
# attribute definitions and no instance rows.
#
# @param name [Object] the relation name; the empty string when omitted
def initialize(name = '')
  @name = name
  @attributes = []
  @instances = []
end
Instance Attribute Details
#attributes ⇒ Object
Returns the value of attribute attributes.
125 126 127 |
# File 'lib/rarff.rb', line 125
# Reader for the @attributes instance variable.
#
# @return [Object] whatever is currently stored in @attributes
def attributes
  @attributes
end
#instances ⇒ Object
Returns the value of attribute instances.
126 127 128 |
# File 'lib/rarff.rb', line 126
# Reader for the @instances instance variable.
#
# @return [Object] whatever is currently stored in @instances
def instances
  @instances
end
#name ⇒ Object
Returns the value of attribute name.
125 126 127 |
# File 'lib/rarff.rb', line 125
# Reader for the @name instance variable.
#
# @return [Object] whatever is currently stored in @name
def name
  @name
end
Instance Method Details
#create_attributes(attr_parse = false) ⇒ Object
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/rarff.rb', line 176
# Derives attribute definitions for the columns of @instances.
#
# Every column is classified from the first non-nil value found in it
# (scanning rows top to bottom). The generated attributes are named
# "Attr<column index>".
#
# * With attr_parse false, the value's Ruby type decides: Numeric ->
#   ATTRIBUTE_NUMERIC, String -> ATTRIBUTE_STRING, true/false ->
#   ATTRIBUTE_BOOLEAN; anything else raises.
# * With attr_parse true, a String that looks like a decimal number
#   replaces the column's definition with a numeric one; any other value
#   leaves whatever definition the column already has untouched.
#
# Finally, every column of the first row that still has no definition is
# given a numeric one.
#
# @param attr_parse [Boolean] classify by parsing string content instead
#   of by Ruby type
# @raise [Exception] when there are no instances or the first row is
#   empty, or (attr_parse false) when a value has an unsupported type
def create_attributes(attr_parse=false)
  if @instances.nil? || @instances.empty? || @instances[0].empty?
    raise Exception, "Not enough data to create ARFF attributes"
  end

  # Tracks which columns have already been looked at. A column is only
  # missing from the first row when that row holds nil for it.
  attributes_defined = {}

  @instances.each do |row|
    row.each_with_index do |col, j|
      next if attributes_defined[j] || col.nil?

      attributes_defined[j] = true # whatever happens, this column is settled

      if attr_parse
        # Only Strings can be pattern-matched. Non-String values (for
        # example Integer placeholders) are skipped rather than sent =~,
        # which plain objects no longer respond to on Ruby >= 3.2.
        if col.is_a?(String) && col =~ /^\-?\d+\.?\d*$/
          @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
        end
        next # parse next column - this one is finished
      end

      # No parsing - just take the value how it is
      if col.kind_of?(Numeric)
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
      elsif col.kind_of?(String)
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
      elsif col == false || col == true # exactly equal to a boolean
        @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_BOOLEAN)
      else
        raise Exception, "Could not parse attribute to ARFF data type: #{col.inspect}"
      end
    end
  end

  # Make sure all attributes have a definition, because otherwise
  # needless errors are thrown
  @instances[0].each_index do |i|
    @attributes[i] ||= Attribute.new("Attr#{i}", ATTRIBUTE_NUMERIC)
  end
end
#expand_sparse(str) ⇒ Object
244 245 246 247 248 249 250 251 |
# File 'lib/rarff.rb', line 244
# Expands one sparse data row ("index value" pairs separated by commas,
# optionally wrapped in braces) into a dense array.
#
# The result has one slot per entry of @attributes, pre-filled with 0.
# Each pair stores its value (kept as a String) at the given index.
#
# @param str [String] sparse row text, e.g. "{1 X, 3 Y}"
# @return [Array] dense row; positions not mentioned in str hold 0
def expand_sparse(str)
  arr = Array.new(@attributes.size, 0)

  # Drop the surrounding braces if present, then walk the pairs.
  body = str.gsub(/^\s*\{(.*)\}\s*$/, "\\1")
  body.split(/\s*\,\s*/).each do |pair|
    # Strip first so padding after "{" cannot produce an empty leading
    # token; limit 2 so the value keeps any inner whitespace.
    index, value = pair.strip.split(/\s+/, 2)
    next if index.nil? # blank pair, e.g. from "{ }"

    arr[index.to_i] = value
  end

  arr
end
#parse(str) ⇒ Object
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/rarff.rb', line 136
# Reads ARFF text line by line and fills in this relation.
#
# Blank lines and comment lines are skipped. A relation line sets @name,
# each attribute line appends an Attribute to @attributes, and the data
# marker switches on the data section. Inside the data section every line
# becomes one row of @instances: sparse rows (wrapped in the sparse
# begin/end markers) are expanded with #expand_sparse, other rows are
# split on commas with outer single quotes removed from each field. After
# each row #create_attributes(true) is called.
#
# NOTE(review): relies on String#my_scan, which is defined outside this
# listing; it is assumed to yield the captures and return a truthy value
# on a match - confirm against its definition.
#
# @param str [String] the ARFF document text
def parse(str)
  in_data_section = false

  # TODO: Doesn't handle commas in quoted attributes.
  str.split("\n").each { |line|
    next if line =~ /^\s*$/
    next if line =~ /^\s*#{COMMENT_MARKER}/
    next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |name| @name = name }
    next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |name, type|
      @attributes.push(Attribute.new(name, type))
    }
    next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }
    next if in_data_section == false

    ## Below is data section handling

    # Trim both ends so trailing blanks or a "\r" left over from CRLF
    # line endings neither hide the closing sparse marker nor end up in
    # the last field of a dense row.
    row_text = line.strip

    next if row_text.my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |data|
      # Sparse ARFF
      # TODO: Factor duplication with non-sparse data below
      @instances << expand_sparse(data.first)
      create_attributes(true)
    }

    next if row_text.my_scan(/^\s*(.*)\s*$/) { |data|
      @instances << data.first.split(/,\s*/).map { |field|
        # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
        field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
      }
      create_attributes(true)
    }
  }
end
#set_string_attributes_to_nominal(column_indices = nil) ⇒ Object
Converts all String-type attributes into nominal attributes, which are more useful in WEKA because more techniques can handle nominal attributes than string attributes.
column_indices is an optional argument specifying the columns that are to be set to nominal (0-based indexes). If it is nil (the default), all columns are included.
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
# File 'lib/rarff.rb', line 225
# Turns string-typed attributes into nominal ones.
#
# For every selected column whose attribute type equals ATTRIBUTE_STRING,
# the distinct non-nil values seen in @instances are collected (in order
# of first appearance) and the attribute's type is replaced by
# "{v1,v2,...}". nil cells are treated as missing and never become a
# member of the set; a column with no non-nil value is left unchanged.
#
# @param column_indices [Array<Integer>, nil] 0-based columns to convert;
#   nil (the default) means every column
# @return [Hash] column index => hash keyed by the collected values
def set_string_attributes_to_nominal(column_indices = nil)
  nominals = {}

  # The 2D array has to be walked row-first even though the values are
  # gathered per column.
  @instances.each do |row|
    row.each_with_index do |value, col_index|
      next if value.nil? # missing value - not a nominal label

      attribute = @attributes[col_index]
      # Rows wider than the attribute list have nothing to convert.
      next unless attribute && attribute.type == ATTRIBUTE_STRING
      next unless column_indices.nil? || column_indices.include?(col_index)

      (nominals[col_index] ||= {})[value] = true
    end
  end

  nominals.each do |index, strings|
    @attributes[index].type = "{#{strings.keys.join(',')}}"
  end
end
#to_arff(sparse = false) ⇒ Object
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 |
# File 'lib/rarff.rb', line 254
# Serialises the relation as ARFF text: the relation line, one line per
# attribute, the data marker, then one line per instance.
#
# Dense rows list every value separated by ", " with nil written as
# MISSING. Sparse rows are wrapped in braces and list "index value"
# pairs; zero values of numeric attributes are omitted, while nil is
# written explicitly as "index MISSING" because an omitted sparse entry
# reads back as 0, not as a missing value.
#
# String-typed values containing a comma, whitespace or "+" are wrapped
# in single quotes; date-typed values are wrapped in double quotes.
#
# @param sparse [Boolean] emit sparse rows instead of dense ones
# @return [String] the ARFF document
def to_arff(sparse=false)
  rows = @instances.map { |inst|
    cells = inst.each_with_index.map { |col, i|
      # First pass - quote strings with spaces, and dates
      # TODO: Doesn't handle cases in which strings already contain
      # quotes or are already quoted.
      unless col.nil?
        if @attributes[i].type =~ /^#{ATTRIBUTE_STRING}$/i
          # NOTE(review): the "+" sits inside the character class, so a
          # literal plus sign also triggers quoting - confirm intent.
          col = "'#{col}'" if col.is_a?(String) && col =~ /[,\s+]/
        elsif @attributes[i].type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
          # Interpolation, so non-String date values do not raise.
          col = "\"#{col}\""
        end
      end

      # Do the final output
      if sparse
        if col.nil?
          "#{i} #{MISSING}"
        elsif @attributes[i].type =~ /^#{ATTRIBUTE_NUMERIC}$/i && col == 0
          nil # zero is the implicit value of an omitted sparse entry
        else
          "#{i} #{col}"
        end
      elsif col.nil?
        MISSING
      else
        col
      end
    }

    if sparse
      "{#{cells.compact.join(', ')}}"
    else
      cells.join(", ")
    end
  }

  RELATION_MARKER + " #{@name}\n" +
    @attributes.join("\n") + "\n" +
    DATA_MARKER + "\n" +
    rows.join("\n")
end
#to_s ⇒ Object
301 302 303 |
# File 'lib/rarff.rb', line 301
# String form of the relation: the result of #to_arff called with its
# default argument.
#
# @return [Object] whatever #to_arff returns
def to_s
  to_arff
end