Module: RDig

Defined in:
lib/rdig.rb,
lib/rdig/index.rb,
lib/rdig/search.rb,
lib/rdig/crawler.rb,
lib/rdig/documents.rb,
lib/rdig/highlight.rb,
lib/rdig/url_filters.rb,
lib/rdig/content_extractors.rb,
lib/rdig/content_extractors/doc.rb,
lib/rdig/content_extractors/pdf.rb,
lib/rdig/content_extractors/hpricot.rb

Overview

See README for basic usage information

Defined Under Namespace

Modules: ContentExtractors, Index, Search, UrlFilters

Classes: Application, Crawler, Document, ETagFilter, FileDocument, HttpDocument

Class Method Summary

Class Method Details

.application ⇒ Object



90
91
92
# File 'lib/rdig.rb', line 90

# Returns the Application instance, creating it on first access.
# The instance is cached in @application, so later calls return the
# same object.
def application
  return @application if @application

  @application = Application.new
end

.configuration ⇒ Object (also known as: config)

RDig configuration

may be used with a block:

RDig.configuration do |config| ...

see doc/examples/config.rb for a commented example configuration



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/rdig.rb', line 104

# RDig configuration.
#
# With a block, the configuration object is yielded to the block and the
# block's result is returned:
#
#   RDig.configuration do |config| ... end
#
# Without a block, the configuration object itself is returned. It is built
# from the defaults below on first access and cached in @config afterwards.
#
# see doc/examples/config.rb for a commented example configuration
def configuration
  return yield(configuration) if block_given?

  @config ||= begin
    uri_normalization = OpenStruct.new(
      :index_document        => nil,
      :remove_trailing_slash => nil
    )

    crawler_defaults = OpenStruct.new(
      :start_urls        => [ "http://localhost:3000/" ],
      :include_hosts     => [ "localhost" ],
      :include_documents => nil,
      :exclude_documents => nil,
      :index_document    => nil,
      :num_threads       => 2,
      :max_redirects     => 5,
      :max_depth         => nil,
      :wait_before_leave => 10,
      :http_proxy        => nil,
      :http_proxy_user   => nil,
      :http_proxy_pass   => nil,
      :normalize_uri     => uri_normalization
    )

    # settings for html content extraction (hpricot)
    hpricot_defaults = OpenStruct.new(
      # css selector for the element containing the page title
      # (might also be a proc returning either an element or a string:
      #  :title_tag_selector => lambda { |hpricot_doc| ... })
      :title_tag_selector   => 'title',
      # css selector for the element containing the page content
      # (might also be a proc returning either an element or a string:
      #  :content_tag_selector => lambda { |hpricot_doc| ... })
      :content_tag_selector => 'body'
    )

    index_defaults = OpenStruct.new(
      :path                => "index/",
      :create              => true,
      :handle_parse_errors => true,
      :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
      :or_default          => false,
      :default_field       => '*'
    )

    OpenStruct.new(
      :log_file           => '/tmp/rdig.log',
      :log_level          => :warn,
      :crawler            => crawler_defaults,
      :content_extraction => OpenStruct.new(:hpricot => hpricot_defaults),
      :index              => index_defaults
    )
  end
end

.create_logger ⇒ Object



162
163
164
165
166
# File 'lib/rdig.rb', line 162

# Builds a new Logger writing to RDig.config.log_file, with its level taken
# from RDig.config.log_level.
#
# The configured level (e.g. :warn or "debug") is upcased and looked up
# among the Logger::Severity constants only. Anything that does not name an
# integer severity falls back to Logger::WARN instead of raising or leaving
# the logger at an unintended level.
#
# Returns the new Logger. Nothing is cached here.
def create_logger
  logger = Logger.new(RDig.config.log_file)
  level_name = RDig.config.log_level.to_s.upcase

  severity = begin
    # inherit=false keeps the lookup from resolving unrelated constants such
    # as Object's RUBY_VERSION.
    Logger::Severity.const_get(level_name, false)
  rescue NameError
    # unknown or syntactically invalid constant name
    nil
  end

  logger.level = severity.is_a?(Integer) ? severity : Logger::WARN
  logger
end

.filter_chain ⇒ Object

Filter chains are used by the crawler to limit the set of documents being indexed. There are two chains: one for HTTP crawling and one for file system crawling. Each document has to pass every filter in the relevant chain to get indexed.



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/rdig.rb', line 67

# Filter chains are used by the crawler to limit the set of documents being
# indexed. There are two chains, one for http crawling (:http) and one for
# file system crawling (:file). Each document has to survive all filters in
# the relevant chain to get indexed.
#
# Returns a Hash with the keys :http and :file, each mapping to an Array of
# filter entries. The Hash is built once and cached in @filter_chain.
# NOTE(review): single-pair hash entries presumably map a filter to the name
# of the crawler setting it is configured from; confirm against the code
# that consumes the chain.
def filter_chain
  @filter_chain ||= begin
    # filter chain for http crawling
    http_filters = [
      :scheme_filter_http,
      :fix_relative_uri,
      { :normalize_uri => :normalize_uri },
      { RDig::UrlFilters::DepthFilter => :max_depth },
      { :hostname_filter => :include_hosts },
      { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
      { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
      RDig::UrlFilters::VisitedUrlFilter
    ]

    # filter chain for file system crawling
    file_filters = [
      :scheme_filter_file,
      { RDig::UrlFilters::PathInclusionFilter => :include_documents },
      { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
    ]

    { :http => http_filters, :file => file_filters }
  end
end

.logger ⇒ Object



154
155
156
# File 'lib/rdig.rb', line 154

# Returns the current logger. If none has been assigned or created yet,
# one is built via create_logger and cached in @logger.
def logger
  return @logger if @logger

  @logger = create_logger
end

.logger=(log) ⇒ Object



158
159
160
# File 'lib/rdig.rb', line 158

# Sets the logger to +log+ by storing it in @logger.
def logger=(log)
  @logger = log
end

.open_uri_http_options ⇒ Object

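Returns the HTTP options for open_uri. They are built from the crawler's proxy settings on first use and stored in the crawler configuration afterwards.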



169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/rdig.rb', line 169

# Returns the options hash for open_uri HTTP requests.
#
# If crawler.open_uri_http_options is already set in the configuration it is
# returned unchanged. Otherwise the hash is derived from the crawler's proxy
# settings, stored back into the configuration and returned:
#
# * :proxy           - set to crawler.http_proxy, when a proxy is configured
# * 'Authorization'  - HTTP Basic credentials built from
#                      crawler.http_proxy_user / crawler.http_proxy_pass,
#                      only when both a proxy and a user are configured
#
# With no proxy configured the result is an empty hash.
def open_uri_http_options
  crawler = RDig::configuration.crawler

  crawler.open_uri_http_options ||= begin
    opts = {}
    if crawler.http_proxy
      opts[:proxy] = crawler.http_proxy
      if (user = crawler.http_proxy_user)
        credentials = "#{user}:#{crawler.http_proxy_pass}"
        # strict_encode64 emits no line feeds. encode64 appends "\n" (and
        # wraps every 60 characters), which is illegal in a header value.
        # NOTE(review): proxy credentials are sent as 'Authorization' rather
        # than 'Proxy-Authorization'; kept as is, confirm this is intended.
        opts['Authorization'] = "Basic #{Base64.strict_encode64(credentials)}"
      end
    end
    opts
  end
end

.searcher ⇒ Object



94
95
96
# File 'lib/rdig.rb', line 94

# Returns the Search::Searcher, creating it on first access from the index
# section of the configuration (config.index). The instance is cached in
# @searcher, so later calls return the same object.
def searcher
  return @searcher if @searcher

  @searcher = Search::Searcher.new(config.index)
end