Class: Iudex::Worker::FilterChainFactory
- Inherits:
- Filter::Core::FilterChainFactory
- Inheritance chain: Object → Filter::Core::FilterChainFactory → Iudex::Worker::FilterChainFactory
- Includes:
- BARC, CharDetector, Core, Core::Filters, DA::Filters::FactoryHelper, Filter::Core, HTML::Filters::FactoryHelper, ROME, SimHash::Filters::FactoryHelper, FetchHelper
- Defined in:
- lib/iudex-worker/filter_chain_factory.rb
Instance Attribute Summary collapse
-
#data_source ⇒ Object
Returns the value of attribute data_source.
-
#executor ⇒ Object
Returns the value of attribute executor.
-
#http_client ⇒ Object
Returns the value of attribute http_client.
-
#visit_counter ⇒ Object
Returns the value of attribute visit_counter.
-
#work_poller ⇒ Object
Returns the value of attribute work_poller.
Instance Method Summary collapse
- #barc_directory ⇒ Object
- #barc_writer ⇒ Object
- #feed_fetcher ⇒ Object
- #feed_post ⇒ Object
- #feed_receiver ⇒ Object
- #feed_ref_new ⇒ Object
- #feed_ref_update ⇒ Object
- #feed_update_keys ⇒ Object
- #feed_updater ⇒ Object
- #filters ⇒ Object
-
#initialize(name) ⇒ FilterChainFactory
constructor
A new instance of FilterChainFactory.
- #last_visit_setter ⇒ Object
- #listeners ⇒ Object
- #page_fetcher ⇒ Object
- #page_post ⇒ Object
- #page_receiver ⇒ Object
- #page_update_keys ⇒ Object
- #page_updater ⇒ Object
- #ref_common_cleanup ⇒ Object
- #ref_html_filters ⇒ Object
- #setup_reporters ⇒ Object
- #type_map ⇒ Object
- #type_switch(tmap = type_map) ⇒ Object
Methods included from FetchHelper
#accept_header, #accept_list, #call_if, #create_content_fetcher, #feed_mime_types, #http_request_headers, #http_user_agent, #page_mime_types
Constructor Details
#initialize(name) ⇒ FilterChainFactory
Returns a new instance of FilterChainFactory.
63 64 65 66 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 63 def initialize( name ) super setup_reporters end |
Instance Attribute Details
#data_source ⇒ Object
Returns the value of attribute data_source.
58 59 60 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 58 def data_source @data_source end |
#executor ⇒ Object
Returns the value of attribute executor.
60 61 62 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 60 def executor @executor end |
#http_client ⇒ Object
Returns the value of attribute http_client.
57 58 59 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 57 def http_client @http_client end |
#visit_counter ⇒ Object
Returns the value of attribute visit_counter.
59 60 61 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 59 def visit_counter @visit_counter end |
#work_poller ⇒ Object
Returns the value of attribute work_poller.
61 62 63 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 61 def work_poller @work_poller end |
Instance Method Details
#barc_directory ⇒ Object
178 179 180 181 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 178 def barc_directory bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) ) bdir end |
#barc_writer ⇒ Object
172 173 174 175 176 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 172 def barc_writer bw = BARCWriter.new( barc_directory ) bw.do_compress = true bw end |
#feed_fetcher ⇒ Object
92 93 94 95 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 92 def feed_fetcher [ create_content_fetcher( :types => :feed_mime_types, :filters => :feed_receiver ) ] end |
#feed_post ⇒ Object
136 137 138 139 140 141 142 143 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 136 def feed_post [ UHashMDCSetter.new, ref_common_cleanup, Prioritizer.new( "feed-post", :constant => 30, :visiting_now => true ), last_visit_setter ] end |
#feed_receiver ⇒ Object
102 103 104 105 106 107 108 109 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 102 def feed_receiver [ RedirectHandler.new, Revisitor.new( visit_counter ), RomeFeedParser.new, DefaultFilter.new, DateChangeFilter.new( false ), feed_updater ] end |
#feed_ref_new ⇒ Object
119 120 121 122 123 124 125 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 119 def feed_ref_new [ UHashMDCSetter.new, ref_common_cleanup, Prioritizer.new( "feed-ref-new", :constant => 50, :min_next => 0.0 ) ] end |
#feed_ref_update ⇒ Object
127 128 129 130 131 132 133 134 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 127 def feed_ref_update [ UHashMDCSetter.new, DateChangeFilter.new( true ), ref_common_cleanup, Prioritizer.new( "feed-ref-update", :constant => 10, :min_next => 0.0 ) ] end |
#feed_update_keys ⇒ Object
159 160 161 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 159 def feed_update_keys page_update_keys + [ :title, :summary, :content ] end |
#feed_updater ⇒ Object
111 112 113 114 115 116 117 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 111 def feed_updater create_update_filter( :fields => feed_update_keys, :on_content => :feed_post, :on_referer => :feed_post, :on_ref_update => :feed_ref_update, :on_ref_new => :feed_ref_new ) end |
#filters ⇒ Object
72 73 74 75 76 77 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 72 def filters [ UHashMDCSetter.new, DefaultFilter.new, super, type_switch ] end |
#last_visit_setter ⇒ Object
209 210 211 212 213 214 215 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 209 def last_visit_setter resv = work_poller && work_poller.reserve? inst = work_poller && work_poller.instance [ Copier.new( *keys( :visit_start, :last_visit ) ), ( Setter.new( :reserved.to_k, nil ) if resv ), ( Setter.new( :instance.to_k, inst ) if inst ) ] end |
#listeners ⇒ Object
79 80 81 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 79 def listeners super + [ MDCUnsetter.new( "uhash" ) ] end |
#page_fetcher ⇒ Object
97 98 99 100 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 97 def page_fetcher [ create_content_fetcher( :types => :page_mime_types, :filters => :page_receiver ) ] end |
#page_post ⇒ Object
189 190 191 192 193 194 195 196 197 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 189 def page_post [ UHashMDCSetter.new, barc_writer, # Not run in 302 referer case, since no SOURCE. Prioritizer.new( "page-post", :constant => 0, :min_next => ( 30 * 60.0 ), :visiting_now => true ), last_visit_setter ] end |
#page_receiver ⇒ Object
163 164 165 166 167 168 169 170 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 163 def page_receiver [ RedirectHandler.new, Revisitor.new( visit_counter ), CharDetectFilter.new, html_clean_filters( :source ), simhash_generator, page_updater ] end |
#page_update_keys ⇒ Object
199 200 201 202 203 204 205 206 207 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 199 def page_update_keys [ :uhash, :domain, :url, :type, ( :reserved if work_poller && work_poller.reserve? ), ( :instance if work_poller && work_poller.instance ), :ref_pub_date, :pub_date, :priority, :last_visit, :next_visit_after, :status, :etag, :reason, :referer, :referent, :cache_file, :cache_file_offset, :simhash ].compact end |
#page_updater ⇒ Object
183 184 185 186 187 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 183 def page_updater create_update_filter( :fields => page_update_keys, :on_content => :page_post, :on_referer => :page_post ) end |
#ref_common_cleanup ⇒ Object
145 146 147 148 149 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 145 def ref_common_cleanup [ ref_html_filters, TextCtrlWSFilter.new( :title.to_k ), FutureDateFilter.new( :pub_date.to_k ) ] end |
#ref_html_filters ⇒ Object
151 152 153 154 155 156 157 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 151 def ref_html_filters [ html_clean_filters( :title ), html_clean_filters( :summary ), html_clean_filters( :content ), html_write_filter( :summary ), html_write_filter( :content ) ] end |
#setup_reporters ⇒ Object
68 69 70 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 68 def setup_reporters # Use default, preserved for overrides end |
#type_map ⇒ Object
83 84 85 86 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 83 def type_map { "FEED" => [ feed_fetcher, :main ], "PAGE" => [ page_fetcher, :main ] } end |
#type_switch(tmap = type_map) ⇒ Object
88 89 90 |
# File 'lib/iudex-worker/filter_chain_factory.rb', line 88 def type_switch( tmap = type_map ) create_switch( :type.to_k, tmap ) end |