Class: Cassiopee::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/cassiopee.rb

Overview

Base class to index and search through a string

Constant Summary collapse

METHOD_DIRECT =
0
METHOD_SUFFIX =
1
FILE_SUFFIX_EXT =
".sfx"
FILE_SUFFIX_POS =
".sfp"
SUFFIXLEN =
'suffix_length'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Crawler

Returns a new instance of Crawler.


299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/cassiopee.rb', line 299

# Builds a crawler with no sequence loaded and every option at its default.
#
# The position filter (@min_position / @max_position), the current pattern
# and the thread limit are initialised here too, so that #filter and
# #filter_position never compare against nil when they are called before
# one of the index*/loadIndex methods has run.
#
# Returns a new instance of Crawler.
def initialize
  # Search options
  @useAmbiguity = false
  @ambiguous = nil
  @useCache = false
  @use_store = false
  @method = METHOD_DIRECT
  @maxthread = nil
  @file_suffix = "crawler"
  @comments = ["#"]

  # Position filter: 0/0 means "no filtering" (see #filter)
  @min_position = 0
  @max_position = 0
  @prev_min_position = 0
  @prev_max_position = 0

  # Indexing state
  @sequence = nil
  @suffix = nil
  @suffixmd5 = nil
  @position = 0
  @suffixes = {}

  # Search state
  @pattern = nil
  @matches = []
  @curmatch = 0

  @cache = Cassiopee::CrawlerCache.new
end

Instance Attribute Details

#ambiguous ⇒ Object

Ambiguity map (Hash)


264
265
266
# File 'lib/cassiopee.rb', line 264

# Reader for the ambiguity map.
#
# Returns the current value of @ambiguous.
def ambiguous
  return @ambiguous
end

#comments ⇒ Object

Array of comment characters to skip lines in input sequence file


258
259
260
# File 'lib/cassiopee.rb', line 258

# Reader for the list of comment characters.
#
# Returns the current value of @comments.
def comments
  return @comments
end

#file_suffix ⇒ Object

Suffix files name/path


252
253
254
# File 'lib/cassiopee.rb', line 252

# Reader for the base name/path of the suffix files.
#
# Returns the current value of @file_suffix.
def file_suffix
  return @file_suffix
end

#maxthread ⇒ Object

Max number of threads to use (not yet used)


254
255
256
# File 'lib/cassiopee.rb', line 254

# Reader for the maximum number of threads.
#
# Returns the current value of @maxthread.
def maxthread
  return @maxthread
end

#method ⇒ Object

Search method: FORCE or SUFFIX.

  • SUFFIX loads all suffixes and searches through them afterwards, which is useful for multiple searches (suffixes are reused).

  • FORCE checks matches while crossing the suffixes. It does not keep parsed data for later searches. The FORCE method does not yet support optimal filters.


270
271
272
# File 'lib/cassiopee.rb', line 270

# Reader for the search method selector.
#
# Returns the current value of @method.
def method
  return @method
end

#use_store ⇒ Object

Use persistent suffix file?


256
257
258
# File 'lib/cassiopee.rb', line 256

# Reader for the persistent-store flag.
#
# Returns the current value of @use_store.
def use_store
  return @use_store
end

#useAmbiguity ⇒ Object

Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile


250
251
252
# File 'lib/cassiopee.rb', line 250

# Reader for the alphabet-ambiguity flag.
#
# Returns the current value of @useAmbiguity.
def useAmbiguity
  return @useAmbiguity
end

#useCache ⇒ Object

Manage basic cache to store previous match


261
262
263
# File 'lib/cassiopee.rb', line 261

# Reader for the match-cache flag.
#
# Returns the current value of @useCache.
def useCache
  return @useCache
end

Instance Method Details

#clear ⇒ Object

Clear suffixes in memory. If using use_store, clear the store too.


340
341
342
343
344
345
346
347
348
# File 'lib/cassiopee.rb', line 340

# Resets the in-memory search state and removes the position store file.
#
# Empties the suffix table and the match list, forgets the current pattern,
# resets the previous position filter to 0/0, clears the match cache, and
# deletes "<file_suffix>.sfp" when that file is present.
#
# Returns the result of File.delete when a file was removed, nil otherwise.
def clear
  @suffixes = {}
  @matches.clear
  @pattern = nil
  @prev_max_position = 0
  @prev_min_position = 0
  @cache.clearCache()

  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  position_file = @file_suffix + FILE_SUFFIX_POS
  File.delete(position_file) if File.exist?(position_file)
end

#extractSuffix(start, len) ⇒ Object

Extract a suffix from the suffix file based on md5 match


586
587
588
589
590
591
592
593
594
595
596
597
598
# File 'lib/cassiopee.rb', line 586

# Reads +len+ bytes starting at byte offset +start+ from the suffix file
# ("<file_suffix>.sfx").
#
# start - offset to seek to before reading
# len   - number of bytes to read
#
# Returns the String read (nil when the offset is at or past end of file).
# On any StandardError the message is printed and nil is returned.
def extractSuffix(start, len)
  # Block form guarantees the handle is closed even when seek/read raises.
  File.open(@file_suffix + FILE_SUFFIX_EXT, "r") do |file|
    file.pos = start
    file.read(len)
  end
rescue => err
  puts "Exception: #{err}"
  nil
end

#filter(posArray) ⇒ Object

Filter the array of positions with defined position filter


563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
# File 'lib/cassiopee.rb', line 563

# Applies the current position filter to a position array.
#
# posArray - array whose first element is the match length, followed by
#            positions
#
# Returns posArray itself when both bounds are 0 (no filter). Otherwise
# returns a new array holding the first element plus every later element
# lying within @min_position..@max_position (inclusive).
def filter(posArray)
  $log.debug("filter the position with #{@min_position} and #{@max_position}")
  return posArray if @min_position == 0 && @max_position == 0

  kept = []
  posArray.each_with_index do |pos, idx|
    # Index 0 carries the match length and is always kept.
    kept << pos if idx.zero? || pos.between?(@min_position, @max_position)
  end
  kept
end

#filter_position(min, max) ⇒ Object

Must be called after index creation or load


435
436
437
438
439
440
441
442
443
# File 'lib/cassiopee.rb', line 435

# Sets the position filter used by later searches.
# Must be called after index creation or load.
#
# min - lower bound of accepted positions
# max - upper bound of accepted positions
#
# When no persistent store is used, the in-memory state is cleared first.
# The previous bounds are remembered in @prev_min_position and
# @prev_max_position before being replaced.
def filter_position(min, max)
  clear() unless use_store

  @prev_min_position, @prev_max_position = @min_position, @max_position
  @min_position, @max_position = min, max
end

#filterCost ⇒ Object


333
334
335
# File 'lib/cassiopee.rb', line 333

# Shortcut for filterOptimal(1).
# NOTE(review): 1 presumably selects the cost criterion - confirm against
# filterOptimal.
#
# Returns whatever filterOptimal returns.
def filterCost
  return filterOptimal(1)
end

#filterLength ⇒ Object


329
330
331
# File 'lib/cassiopee.rb', line 329

# Shortcut for filterOptimal(0).
# NOTE(review): 0 presumably selects the length criterion - confirm against
# filterOptimal.
#
# Returns whatever filterOptimal returns.
def filterLength
  return filterOptimal(0)
end

#indexFile(f) ⇒ Object

Index an input file. Clear existing indexes.


359
360
361
362
363
364
365
366
367
368
# File 'lib/cassiopee.rb', line 359

# Loads the sequence from file +f+ and resets all search state.
#
# f - path handed to readSequence
#
# Design notes kept from the original author:
# - parse the file and map letters to a reduced alphabet
# - later on, use a binary map instead of an ASCII map
# - take all suffixes, order by length, link to a position map in another file
# - store md5 for easier comparison? (+20 bytes per suffix)
def indexFile(f)
  @sequence = readSequence(f)
  clear()
  # 0/0 disables the position filter.
  @min_position = @max_position = 0
end

#indexString(s) ⇒ Object

Index an input string. Clear existing indexes.


373
374
375
376
377
378
379
380
381
# File 'lib/cassiopee.rb', line 373

# Uses string +s+ as the sequence, writes it to the suffix file
# ("<file_suffix>.sfx") and resets all search state.
#
# s - the sequence to index
def indexString(s)
  @sequence = s
  File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') { |out| out.puts(@sequence) }
  clear()
  # 0/0 disables the position filter.
  @min_position = @max_position = 0
end

#loadAmbiguityFile(f) ⇒ Object

Load ambiguity rules from a file. The file format should be one rule per line:

  • A=B,C then D=E,F …


390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/cassiopee.rb', line 390

# Loads ambiguity rules from file +f+ and turns ambiguity handling on.
#
# Each line has the form "X=Y,Z". Lines are lower-cased; the part before the
# first "=" becomes the key and the comma-separated list after it becomes
# the value in @ambiguous. Lines without a value part (blank lines, lines
# without "=") are skipped instead of raising.
#
# f - path of the rules file
#
# Logs an error and exits the process with status 1 when +f+ does not exist.
# Returns nil.
def loadAmbiguityFile(f)
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  unless File.exist?(f)
    $log.error("File #{f} does not exist")
    exit(1)
  end

  @ambiguous = {}
  # File.foreach closes the file even when a line fails to parse.
  File.foreach(f) do |line|
    key, equivalents = line.downcase.chomp.split('=')
    next if equivalents.nil?

    @ambiguous[key] = equivalents.split(',')
  end

  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
  nil
end

#loadIndex ⇒ Object

Load sequence from a previous index command


411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# File 'lib/cassiopee.rb', line 411

# Reloads the sequence written by a previous index command.
#
# Reads "<file_suffix>.sfx" line by line, lower-cases and chomps each line,
# and concatenates the result into @sequence. All search state is then
# cleared and the position filter is reset to 0/0.
#
# On a read error the exception is logged and the process exits with
# status 1 (the bare exit() used before reported success on failure).
def loadIndex
  seq = +''
  begin
    # File.foreach closes the file even when reading raises.
    File.foreach(@file_suffix + FILE_SUFFIX_EXT) do |line|
      seq << line.downcase.chomp
    end
  rescue => err
    $log.error("Exception: #{err}")
    exit(1)
  end

  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end

#next ⇒ Object

Iterates over matches


602
603
604
605
606
607
608
609
610
# File 'lib/cassiopee.rb', line 602

# Iterates over the matches, one per call.
#
# Returns the match at the current cursor and advances the cursor. Once all
# matches have been returned, resets the cursor to 0 and returns nil, so the
# following call starts again from the first match.
def next
  if @curmatch >= @matches.length
    @curmatch = 0
    return nil
  end

  current = @matches[@curmatch]
  @curmatch += 1
  current
end

#searchApproximate(s, edit) ⇒ Object


494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
# File 'lib/cassiopee.rb', line 494

# Searches the indexed sequence for pattern +s+, tolerating errors.
#
# s    - pattern to search for (lower-cased before matching)
# edit - error budget. 0 without ambiguity delegates to searchExact.
#        edit >= 0 sets useHamming and only considers matches of exactly
#        s.length characters; edit < 0 clears useHamming, uses the absolute
#        value and considers lengths from s.length - |edit| to
#        s.length + |edit|.
#
# Returns the match list. Entries appended here have the shape
# [md5val, errors, filteredPosArray].
def searchApproximate(s,edit)
		
	# No error allowed and no ambiguity: this is a plain exact search.
	if(edit==0 && !@useAmbiguity) 
		return searchExact(s)
	end
			# Keep the caller's signed value; +edit+ is made positive below but
			# parseSuffixes receives the original.
			allowederrors = edit
	if(edit>=0)
	  useHamming = true
	  minmatchsize = s.length
	  maxmatchsize = s.length
			  # NOTE(review): the first argument (1 here, 2 below) presumably
			  # identifies the search type for the cache - confirm in updateCache.
			  updateCache(1,edit)
			  @matches = @cache.loadCache()
	else
	  useHamming = false
	  edit = edit * (-1)
      minmatchsize = s.length - edit
      maxmatchsize = s.length + edit
			  updateCache(2,edit)
			  @matches = @cache.loadCache()
    end
			
			# Cache hit: return what the cache provided. Unlike the other exits
			# this one does not go through cache?.
			if(@matches.length>0)
				return @matches
			end
			
			s = s.downcase
    
			
    #@matches.clear
			# Suffix-table keys are compared against this digest below.
			@pattern = Digest::MD5.hexdigest(s)
			
			parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s)
    
			# Unless the SUFFIX method is selected, stop here and return the
			# current matches without scanning @suffixes.
			return cache?(@matches) unless(method == METHOD_SUFFIX)
			
 
    
	      	
	@suffixes.each do |md5val,posArray|
		# The SUFFIXLEN key is not a suffix entry; skip it.
		if(md5val == SUFFIXLEN)
			next
		end
		# Same digest as the pattern: recorded with 0 errors.
		if (md5val == @pattern)
			filteredPosArray = filter(posArray)
            match = Array[md5val, 0, filteredPosArray]
$log.debug "Match: " << match.inspect
@matches << match
		    	else
# posArray[0] is the suffix length: only candidates within the allowed size
# range are read back from the suffix file and compared.
if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize)

	seq = extractSuffix(posArray[1],posArray[0])
						errors = isApproximateEqual?(seq,s,useHamming,edit)
						
	# A non-negative result is stored as the match's error count; a negative
	# one is not recorded.
	if(errors>=0)
		filteredPosArray = filter(posArray)
	    match = Array[md5val, errors, filteredPosArray]
		$log.debug "Match: " << match.inspect
		@matches << match
	end
end
        end
	
	end
	
	return cache?(@matches) 
end

#searchExact(s) ⇒ Object

Search exact match


447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
# File 'lib/cassiopee.rb', line 447

# Searches the indexed sequence for exact occurrences of +s+.
#
# s - pattern to search for (lower-cased before matching)
#
# With ambiguity enabled the call is delegated to searchApproximate(s, 0).
# Otherwise the cache is consulted first; on a miss the suffixes are parsed
# and, for the SUFFIX method only, the suffix table is scanned for entries
# accepted by isMatchEqual?.
#
# Returns the match list; entries appended here have the shape
# [md5val, 0, posArray].
def searchExact(s)
  return searchApproximate(s, 0) if @useAmbiguity

  s = s.downcase

  updateCache(0, 0)
  @matches = @cache.loadCache()
  return cache?(@matches) if @matches.length > 0

  # MD5 is 128 bits, which is easier to compare than large strings.
  @pattern = Digest::MD5.hexdigest(s)

  parseSuffixes(@sequence, s.length, s.length, 0, s)

  # Any method other than SUFFIX returns the current matches as they are.
  return @matches if method != METHOD_SUFFIX

  @suffixes.each do |md5val, posArray|
    next unless isMatchEqual?(md5val)

    match = [md5val, 0, posArray]
    $log.debug "Match: " << match.inspect
    @matches << match
  end

  cache?(@matches)
end

#setLogLevel(level) ⇒ Object

Set Logger level


352
353
354
# File 'lib/cassiopee.rb', line 352

# Sets the level of the global logger ($log).
#
# level - value assigned to $log.level
#
# Returns level.
def setLogLevel(level)
  logger = $log
  logger.level = level
end

#to_pos ⇒ Object


612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
# File 'lib/cassiopee.rb', line 612

# Regroups the current matches by position.
#
# Every match is [md5val, errors, posArray], where posArray starts with the
# match length followed by the positions. For each position the pair
# [length, errors] is collected.
#
# Returns an array of [position, [[length, errors], ...]] pairs sorted by
# position.
def to_pos
  by_position = {}

  @matches.each do |match|
    length = 0
    match[2].each_with_index do |pos, idx|
      # First element of the position array is the match length.
      if idx.zero?
        length = pos
        next
      end

      (by_position[pos] ||= []) << [length, match[1]]
    end
  end

  by_position.sort
end

#to_s ⇒ Object


640
641
642
# File 'lib/cassiopee.rb', line 640

# Summarises the crawler as text: the number of current matches.
#
# The count is interpolated as a decimal number. Appending the Integer with
# String#<< (as before) inserts the character with that code point instead.
# The summary is still printed to standard output, as it was before, and is
# now also returned so that the method honours the to_s contract.
#
# Returns the summary String, e.g. { matches: "3" }.
def to_s
  summary = "{ matches: \"#{@matches.length}\" }"
  puts summary
  summary
end