Class: Iudex::Worker::FilterChainFactory
- Inherits:
-
Filter::Core::FilterChainFactory
- Object
- Filter::Core::FilterChainFactory
- Iudex::Worker::FilterChainFactory
- Includes:
- BARC, CharDetector, Core, Core::Filters, DA::Filters::FactoryHelper, Filter::Core, HTML::Filters::FactoryHelper, ROME, SimHash::Filters::FactoryHelper, FetchHelper
- Defined in:
- lib/iudex-worker/filter_chain_factory.rb
Instance Attribute Summary
-
#data_source ⇒ Object
Returns the value of attribute data_source.
-
#executor ⇒ Object
Returns the value of attribute executor.
-
#http_client ⇒ Object
Returns the value of attribute http_client.
-
#visit_counter ⇒ Object
Returns the value of attribute visit_counter.
Instance Method Summary
- #barc_directory ⇒ Object
- #barc_writer ⇒ Object
- #feed_fetcher ⇒ Object
- #feed_post ⇒ Object
- #feed_receiver ⇒ Object
- #feed_ref_new ⇒ Object
- #feed_ref_update ⇒ Object
- #feed_update_keys ⇒ Object
- #feed_updater ⇒ Object
- #filters ⇒ Object
-
#initialize(name) ⇒ FilterChainFactory
constructor
A new instance of FilterChainFactory.
- #last_visit_setter ⇒ Object
- #listeners ⇒ Object
- #page_fetcher ⇒ Object
- #page_post ⇒ Object
- #page_receiver ⇒ Object
- #page_update_keys ⇒ Object
- #page_updater ⇒ Object
- #ref_common_cleanup ⇒ Object
- #ref_html_filters ⇒ Object
- #setup_reporters ⇒ Object
- #type_map ⇒ Object
- #type_switch(tmap = type_map) ⇒ Object
Methods included from FetchHelper
#accept_header, #accept_list, #call_if, #create_content_fetcher, #feed_mime_types, #http_request_headers, #http_user_agent, #page_mime_types
Constructor Details
#initialize(name) ⇒ FilterChainFactory
Returns a new instance of FilterChainFactory.
62 63 64 65 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 62 def initialize( name ) super setup_reporters end |
Instance Attribute Details
#data_source ⇒ Object
Returns the value of attribute data_source.
58 59 60 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 58 def data_source @data_source end |
#executor ⇒ Object
Returns the value of attribute executor.
60 61 62 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 60 def executor @executor end |
#http_client ⇒ Object
Returns the value of attribute http_client.
57 58 59 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 57 def http_client @http_client end |
#visit_counter ⇒ Object
Returns the value of attribute visit_counter.
59 60 61 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 59 def visit_counter @visit_counter end |
Instance Method Details
#barc_directory ⇒ Object
177 178 179 180 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 177 def barc_directory bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) ) bdir end |
#barc_writer ⇒ Object
171 172 173 174 175 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 171 def barc_writer bw = BARCWriter.new( barc_directory ) bw.do_compress = true bw end |
#feed_fetcher ⇒ Object
91 92 93 94 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 91 def feed_fetcher [ create_content_fetcher( :types => :feed_mime_types, :filters => :feed_receiver ) ] end |
#feed_post ⇒ Object
135 136 137 138 139 140 141 142 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 135 def feed_post [ UHashMDCSetter.new, ref_common_cleanup, Prioritizer.new( "feed-post", :constant => 30, :visiting_now => true ), last_visit_setter ] end |
#feed_receiver ⇒ Object
101 102 103 104 105 106 107 108 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 101 def feed_receiver [ RedirectHandler.new, Revisitor.new( visit_counter ), RomeFeedParser.new, DefaultFilter.new, DateChangeFilter.new( false ), feed_updater ] end |
#feed_ref_new ⇒ Object
118 119 120 121 122 123 124 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 118 def feed_ref_new [ UHashMDCSetter.new, ref_common_cleanup, Prioritizer.new( "feed-ref-new", :constant => 50, :min_next => 0.0 ) ] end |
#feed_ref_update ⇒ Object
126 127 128 129 130 131 132 133 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 126 def feed_ref_update [ UHashMDCSetter.new, DateChangeFilter.new( true ), ref_common_cleanup, Prioritizer.new( "feed-ref-update", :constant => 10, :min_next => 0.0 ) ] end |
#feed_update_keys ⇒ Object
158 159 160 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 158 def feed_update_keys page_update_keys + [ :title, :summary, :content ] end |
#feed_updater ⇒ Object
110 111 112 113 114 115 116 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 110 def feed_updater create_update_filter( :fields => feed_update_keys, :on_content => :feed_post, :on_referer => :feed_post, :on_ref_update => :feed_ref_update, :on_ref_new => :feed_ref_new ) end |
#filters ⇒ Object
71 72 73 74 75 76 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 71 def filters [ UHashMDCSetter.new, DefaultFilter.new, super, type_switch ] end |
#last_visit_setter ⇒ Object
206 207 208 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 206 def last_visit_setter Copier.new( *keys( :visit_start, :last_visit ) ) end |
#listeners ⇒ Object
78 79 80 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 78 def listeners super + [ MDCUnsetter.new( "uhash" ) ] end |
#page_fetcher ⇒ Object
96 97 98 99 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 96 def page_fetcher [ create_content_fetcher( :types => :page_mime_types, :filters => :page_receiver ) ] end |
#page_post ⇒ Object
188 189 190 191 192 193 194 195 196 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 188 def page_post [ UHashMDCSetter.new, barc_writer, # Not run in 302 referer case, since no SOURCE. Prioritizer.new( "page-post", :constant => 0, :min_next => ( 30 * 60.0 ), :visiting_now => true ), last_visit_setter ] end |
#page_receiver ⇒ Object
162 163 164 165 166 167 168 169 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 162 def page_receiver [ RedirectHandler.new, Revisitor.new( visit_counter ), CharDetectFilter.new, html_clean_filters( :source ), simhash_generator, page_updater ] end |
#page_update_keys ⇒ Object
198 199 200 201 202 203 204 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 198 def page_update_keys [ :uhash, :domain, :url, :type, :ref_pub_date, :pub_date, :priority, :last_visit, :next_visit_after, :status, :etag, :reason, :referer, :referent, :cache_file, :cache_file_offset, :simhash ] end |
#page_updater ⇒ Object
182 183 184 185 186 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 182 def page_updater create_update_filter( :fields => page_update_keys, :on_content => :page_post, :on_referer => :page_post ) end |
#ref_common_cleanup ⇒ Object
144 145 146 147 148 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 144 def ref_common_cleanup [ ref_html_filters, TextCtrlWSFilter.new( :title.to_k ), FutureDateFilter.new( :pub_date.to_k ) ] end |
#ref_html_filters ⇒ Object
150 151 152 153 154 155 156 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 150 def ref_html_filters [ html_clean_filters( :title ), html_clean_filters( :summary ), html_clean_filters( :content ), html_write_filter( :summary ), html_write_filter( :content ) ] end |
#setup_reporters ⇒ Object
67 68 69 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 67 def setup_reporters # Use default, preserved for overrides end |
#type_map ⇒ Object
82 83 84 85 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 82 def type_map { "FEED" => [ feed_fetcher, :main ], "PAGE" => [ page_fetcher, :main ] } end |
#type_switch(tmap = type_map) ⇒ Object
87 88 89 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 87 def type_switch( tmap = type_map ) create_switch( :type.to_k, tmap ) end |