Class: SVMLight::Model

Inherits:
Object
  • Object
show all
Defined in:
lib/svmredlight/model.rb,
ext/svmredlight.c

Overview

A model is the product of training an SVM. Once created, it can take documents as inputs and act on them (for instance, by classifying them). Models can also be read from files created by svm_learn.

Constant Summary collapse

TYPES =
[:classification]

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.from_file(filename) ⇒ Object

Reads an svm_light model from a file generated by svm_learn. Receives the filename as its argument. Make sure the file exists before calling this! Otherwise exit(1) might be called and the Ruby interpreter will die.



36
37
38
39
40
41
42
43
44
45
46
47
# File 'ext/svmredlight.c', line 36

/*
 * Builds a model object from a model file on disk.
 *
 * klass    - Ruby class the resulting object is wrapped as.
 * filename - Ruby String holding the path handed to read_model.
 *
 * Raises TypeError (via Check_Type) when filename is not a String.
 * NOTE(review): read_model is not defined here; the public docs warn it may
 * call exit(1) on a missing file, so callers should verify the path first.
 */
static VALUE
model_read_from_file(VALUE klass, VALUE filename){
  MODEL *model;

  Check_Type(filename, T_STRING);

  model = read_model(StringValuePtr(filename));

  // Linear models additionally get their weight vector attached.
  if(is_linear(model)){
    add_weight_vector_to_linear_model(model);
  }

  return Data_Wrap_Struct(klass, 0, model_free, model);
}

.learn_classification(r_docs_and_classes, learn_params, kernel_params, use_cache, alpha) ⇒ Object

If no linear



485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
# File 'ext/svmredlight.c', line 485

/*
 * Learns a classification model from an array of [Document, label] pairs.
 *
 * klass              - Ruby class the resulting model object is wrapped as.
 * r_docs_and_classes - Array of arrays; each inner array holds a Document
 *                      followed by a numeric label.
 * learn_params       - Hash handed to setup_learn_params.
 * kernel_params      - Hash handed to setup_kernel_params.
 * use_cache          - Not read by this function; the kernel type is forced
 *                      to LINEAR below.
 * alpha              - nil, or an Array of numbers forwarded to
 *                      svm_learn_classification.
 *
 * Returns the learned MODEL wrapped as an instance of klass.
 *
 * Raises TypeError when an argument has the wrong Ruby type and
 * ArgumentError for every other validation failure.
 */
static VALUE
model_learn_classification(VALUE klass, 
                           VALUE r_docs_and_classes,  // Docs + labels array of arrays
                           VALUE learn_params,        // Options hash with learning options
                           VALUE kernel_params,       // Options hash with kernel options
                           VALUE use_cache,           // Unused while only linear kernels are supported
                           VALUE alpha                // nil or array of numbers
                          ){
  long i;   // long, to match the type of RARRAY_LEN and totdocs
  double *labels = NULL, *alpha_in = NULL;
  long totdocs, totwords = 0,  fnum = 0;
  MODEL  *m = NULL;
  DOC    **c_docs = NULL;
  LEARN_PARM c_learn_param;
  KERNEL_PARM c_kernel_param;
  VALUE temp_ary, exception = rb_eArgError;
  // Start empty so a helper that fails without writing a message cannot make
  // us raise with uninitialised bytes.
  char error_msg[300] = "";

  Check_Type(r_docs_and_classes, T_ARRAY);
  Check_Type(learn_params, T_HASH);
  Check_Type(kernel_params, T_HASH);

  if(!(TYPE(alpha) == T_ARRAY || NIL_P(alpha) ))
    rb_raise(rb_eTypeError, "alpha must be an numeric array or nil");
  
  if(TYPE(alpha) == T_ARRAY){

    alpha_in = my_malloc(sizeof(double) * RARRAY_LEN(alpha));

    for(i=0; i < RARRAY_LEN(alpha); i++){

      if(TYPE(RARRAY_PTR(alpha)[i]) != T_FLOAT && 
         TYPE(RARRAY_PTR(alpha)[i]) != T_FIXNUM ){

        strncpy(error_msg,"All elements of the alpha array must be numeric ", sizeof(error_msg));
        goto bail;
      }
      
      alpha_in[i] = NUM2DBL(RARRAY_PTR(alpha)[i]);
    }
  }

  if(setup_learn_params(&c_learn_param, learn_params, error_msg) != 0){
    goto bail;
  }

  c_learn_param.type = CLASSIFICATION;

  if(setup_kernel_params(&c_kernel_param, kernel_params, error_msg) != 0){
    goto bail;
  }

  //TODO Setup kernel cache when we support non linear kernels
  c_kernel_param.kernel_type = LINEAR;

  if(check_kernel_and_learn_params_logic(&c_kernel_param, &c_learn_param, error_msg) != 0){
    goto bail;
  }

  totdocs = (long)RARRAY_LEN(r_docs_and_classes);

  if (totdocs == 0){
    strncpy(error_msg, "Cannot create Model from empty Documents array", sizeof(error_msg));
    goto bail;
  }

  // NOTE(review): svm_learn_classification receives totdocs as the element
  // count and presumably indexes alpha_in once per document; a shorter array
  // would then be accessed out of bounds. Confirm against svm_learn.c.
  if(alpha_in != NULL && RARRAY_LEN(alpha) < totdocs){
    strncpy(error_msg, "The alpha array must have one element per document", sizeof(error_msg));
    goto bail;
  }
  
  c_docs  = (DOC **)my_malloc(sizeof(DOC *)*(totdocs)); 
  labels  = (double*)my_malloc(sizeof(double)*totdocs);

  for(i=0; i < totdocs; i++){
    // Just one of the documents and classes arrays, we expect temp_ary to have a Document
    // and a label (long)
    temp_ary = RARRAY_PTR(r_docs_and_classes)[i] ;

    if( TYPE(temp_ary) != T_ARRAY || 
        RARRAY_LEN(temp_ary) < 2  ||
        rb_obj_class(RARRAY_PTR(temp_ary)[0]) != rb_cDocument ||  
        (TYPE(RARRAY_PTR(temp_ary)[1]) != T_FLOAT && TYPE(RARRAY_PTR(temp_ary)[1]) != T_FIXNUM )){
      
      strncpy(error_msg, "All elements of documents and labels should be arrays,"
          "where the first element is a document and the second a number", sizeof(error_msg));

      goto bail;
    }
      
    Data_Get_Struct(RARRAY_PTR(temp_ary)[0], DOC, c_docs[i]);
    labels[i] = NUM2DBL(RARRAY_PTR(temp_ary)[1]);

    fnum = 0;

    // Increase feature number while there are still words in the vector
    // (a wnum of 0 terminates the words array).
    while(c_docs[i]->fvec->words[fnum].wnum) {
      fnum++;
    }
    
    // The last word before the terminator is taken as the highest feature
    // number. Skip documents with no words so words[-1] is never read.
    if(fnum > 0 && c_docs[i]->fvec->words[fnum - 1].wnum > totwords)
      totwords = c_docs[i]->fvec->words[fnum - 1].wnum;

    if(totwords > MAXFEATNUM){
      strncpy(error_msg, "The number of features exceeds MAXFEATNUM the maximun "
                    "number of features defined for this version of SVMLight", sizeof(error_msg));
      goto bail;
    }
  }
  
  m = (MODEL *)my_malloc(sizeof(MODEL));

  svm_learn_classification(c_docs, labels, totdocs, totwords, 
      &c_learn_param, &c_kernel_param, NULL, m, alpha_in);

  free(alpha_in);
  free(labels);

  // If need arises to free the data do a deep copy of m and create the ruby object with
  // that data.
  // free(c_docs);
  return Data_Wrap_Struct(klass, 0, model_free, m);

bail:
  // Only the arrays allocated here are released; the DOC structs themselves
  // belong to their Ruby Document objects.
  free(alpha_in);
  free(labels);
  free(c_docs);
  // The message is passed as an argument, never as the format string, so a
  // '%' inside it cannot be interpreted as a conversion.
  rb_raise(exception, "%s", error_msg);
}

.new(type, documents_and_lables, learn_params, kernel_params, alphas = nil) ⇒ Object

Learns a model from a set of labeled documents.

Parameters:

  • type, (Symbol)

    What kind of model this is (classification, regression, etc.). For now the only valid value is classification.

  • documents_and_lables (Array)

    An array of arrays where each inner array must have two elements: first a Document, and second a classification label (normally +1 or -1).

  • learn_params (Hash)

    each key of learn_params is a string that maps to a field of the LEARN_PARM struct in SVMLight

  • kernel_params (Hash)

    each key of kernel_params is a string that maps to a field of the KERNEL_PARM struct in SVMLight

  • alphas (Array|Nil) (defaults to: nil)

    an array of alpha values

Raises:

  • (ArgumentError)


16
17
18
19
20
# File 'lib/svmredlight/model.rb', line 16

# Learns a model from a set of labeled documents.
#
# @param [Symbol] type model kind; must be one of TYPES
# @param [Array] documents_and_lables array of [Document, label] pairs
# @param [Hash] learn_params learning options forwarded to learn_classification
# @param [Hash] kernel_params kernel options forwarded to learn_classification
# @param [Array, nil] alphas optional array of alpha values
#
# @raise [ArgumentError] when type is not listed in TYPES
def self.new(type, documents_and_lables, learn_params, kernel_params, alphas = nil )
  unless TYPES.include?(type)
    raise ArgumentError, "Supporte types are (for now) #{TYPES}"
  end

  # The fourth argument (use_cache) is always false here.
  learn_classification(documents_and_lables, learn_params, kernel_params, false, alphas)
end

.read_from_file(pahtofile) ⇒ Object

Will load an existent model from a file

Parameters:

  • pahtofile (String)

    path to the model file



38
39
40
41
42
43
44
45
46
# File 'lib/svmredlight/model.rb', line 38

# Loads an existing model from a file.
#
# @param [String] pahtofile path to the model file
#
# @raise [MissingModelFile] when the path is not an existing regular file
def self.read_from_file(pahtofile)
  # File.file? is already false for paths that do not exist, so a separate
  # existence check is redundant. File.exists? is deprecated and was removed
  # in Ruby 3.2, where calling it raises NoMethodError.
  unless File.file?(pahtofile)
    raise MissingModelFile, "the #{pahtofile} does not exists or is not a file"
  end

  from_file(pahtofile)
end

Instance Method Details

#classify(example) ⇒ Object

Classify, takes an example (instance of Document) and returns its classification



611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
# File 'ext/svmredlight.c', line 611

/*
 * Classifies one example with this model.
 *
 * self    - wrapped MODEL.
 * example - wrapped DOC (an instance of Document).
 *
 * Returns the value produced by classify_example as a Ruby Float.
 *
 * NOTE(review): example is unwrapped as a DOC without checking its class;
 * consider verifying it is a Document before use.
 */
static VALUE
model_classify_example(VALUE self, VALUE example){
  DOC *ex;
  MODEL *m;
  double result;

  Data_Get_Struct(example, DOC, ex);
  Data_Get_Struct(self, MODEL, m);

  // classify_example is called for every model; an explicit
  // classify_example_linear branch for linear models was judged unnecessary.
  result = classify_example(m, ex);

  // Ruby Floats are double precision. Hand the double over as-is instead of
  // narrowing it to float first, which discarded precision.
  return rb_float_new(result);
}

#maxdiffObject



668
669
670
671
672
673
674
# File 'ext/svmredlight.c', line 668

/*
 * Returns the maxdiff field of the wrapped MODEL, converted with DBL2NUM.
 */
static VALUE
model_maxdiff(VALUE self){
  MODEL *model;

  Data_Get_Struct(self, MODEL, model);
  return DBL2NUM(model->maxdiff);
}

#support_vectors_countObject



632
633
634
635
636
637
638
# File 'ext/svmredlight.c', line 632

/*
 * Returns the sv_num field of the wrapped MODEL as a Ruby Fixnum.
 */
static VALUE
model_support_vectors_count(VALUE self){
  MODEL *model;

  Data_Get_Struct(self, MODEL, model);
  return INT2FIX(model->sv_num);
}

#to_file(pahtofile) ⇒ Object



640
641
642
643
644
645
646
647
648
649
650
# File 'ext/svmredlight.c', line 640

/*
 * Writes the wrapped MODEL to the given path through write_model.
 *
 * self      - wrapped MODEL.
 * pahtofile - Ruby String with the destination path.
 *
 * Always returns nil. Raises TypeError (via Check_Type) when pahtofile is
 * not a String.
 */
static VALUE
model_write_to_file(VALUE self, VALUE pahtofile){
  MODEL *model;

  Check_Type(pahtofile, T_STRING);
  Data_Get_Struct(self, MODEL, model);

  write_model(StringValuePtr(pahtofile), model);

  return Qnil;
}

#total_wordsObject



652
653
654
655
656
657
658
# File 'ext/svmredlight.c', line 652

/*
 * Returns the totwords field of the wrapped MODEL as a Ruby Fixnum.
 */
static VALUE
model_total_words(VALUE self){
  MODEL *model;

  Data_Get_Struct(self, MODEL, model);
  return INT2FIX(model->totwords);
}

#totdocObject



660
661
662
663
664
665
666
# File 'ext/svmredlight.c', line 660

/*
 * Returns the totdoc field of the wrapped MODEL as a Ruby Fixnum.
 */
static VALUE
model_totdoc(VALUE self){
  MODEL *model;

  Data_Get_Struct(self, MODEL, model);
  return INT2FIX(model->totdoc);
}

#write_to_file(pahtofile) ⇒ Object

Will create a file containing the model info. The model info can be turned back into a model by using Model.read_from_file

Parameters:

  • pahtofile (String)


53
54
55
56
57
58
59
60
61
62
63
# File 'lib/svmredlight/model.rb', line 53

# Writes the model info to a file; the file can later be loaded again with
# Model.read_from_file.
#
# @param [String] pahtofile destination path
#
# @raise [ModelWriteError] when the parent directory of pahtofile is missing
#   or not writable
def write_to_file(pahtofile)
  target_dir = File.dirname(pahtofile)

  unless File.directory?(target_dir) && File.writable?(target_dir)
    raise ModelWriteError, "impossible to write #{pahtofile}" 
  end

  to_file(pahtofile)
end