Class: Iudex::Worker::FilterChainFactory

Inherits:
Filter::Core::FilterChainFactory
  • Object
show all
Includes:
BARC, CharDetector, Core, Core::Filters, DA::Filters::FactoryHelper, Filter::Core, HTML::Filters::FactoryHelper, ROME, SimHash::Filters::FactoryHelper, FetchHelper
Defined in:
lib/iudex-worker/filter_chain_factory.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from FetchHelper

#accept_header, #accept_list, #call_if, #create_content_fetcher, #feed_mime_types, #http_request_headers, #http_user_agent, #page_mime_types

Constructor Details

#initialize(name) ⇒ FilterChainFactory

Returns a new instance of FilterChainFactory.



63
64
65
66
# File 'lib/iudex-worker/filter_chain_factory.rb', line 63

# Construct the factory: hand +name+ to the parent constructor, then run
# the setup_reporters hook.
def initialize( name )
  super( name )
  setup_reporters
end

Instance Attribute Details

#data_source ⇒ Object

Returns the value of attribute data_source.



58
59
60
# File 'lib/iudex-worker/filter_chain_factory.rb', line 58

# Reader: returns the current value of the @data_source instance variable.
def data_source
  @data_source
end

#executor ⇒ Object

Returns the value of attribute executor.



60
61
62
# File 'lib/iudex-worker/filter_chain_factory.rb', line 60

# Reader: returns the current value of the @executor instance variable.
def executor
  @executor
end

#http_client ⇒ Object

Returns the value of attribute http_client.



57
58
59
# File 'lib/iudex-worker/filter_chain_factory.rb', line 57

# Reader: returns the current value of the @http_client instance variable.
def http_client
  @http_client
end

#visit_counter ⇒ Object

Returns the value of attribute visit_counter.



59
60
61
# File 'lib/iudex-worker/filter_chain_factory.rb', line 59

# Reader: returns the current value of the @visit_counter instance variable.
def visit_counter
  @visit_counter
end

#work_poller ⇒ Object

Returns the value of attribute work_poller.



61
62
63
# File 'lib/iudex-worker/filter_chain_factory.rb', line 61

# Reader: returns the current value of the @work_poller instance variable.
def work_poller
  @work_poller
end

Instance Method Details

#barc_directory ⇒ Object



178
179
180
181
# File 'lib/iudex-worker/filter_chain_factory.rb', line 178

# Build a new BARCDirectory over the relative path "./barc".
def barc_directory
  BARCDirectory.new( Java::java.io.File.new( "./barc" ) )
end

#barc_writer ⇒ Object



172
173
174
175
176
# File 'lib/iudex-worker/filter_chain_factory.rb', line 172

# Build a BARCWriter over barc_directory with do_compress switched on.
def barc_writer
  BARCWriter.new( barc_directory ).tap do |writer|
    writer.do_compress = true
  end
end

#feed_fetcher ⇒ Object



92
93
94
95
# File 'lib/iudex-worker/filter_chain_factory.rb', line 92

# Single-element list holding a content fetcher configured with the
# :feed_mime_types types and the :feed_receiver filters.
def feed_fetcher
  fetcher = create_content_fetcher( :types   => :feed_mime_types,
                                    :filters => :feed_receiver )
  [ fetcher ]
end

#feed_post ⇒ Object



136
137
138
139
140
141
142
143
# File 'lib/iudex-worker/filter_chain_factory.rb', line 136

# Filter list for the "feed-post" step: UHashMDCSetter, the common
# reference cleanup, a "feed-post" Prioritizer and last_visit_setter,
# in that order.
def feed_post
  chain = [ UHashMDCSetter.new ]
  chain << ref_common_cleanup
  chain << Prioritizer.new( "feed-post",
                            :constant => 30,
                            :visiting_now => true )
  chain << last_visit_setter
  chain
end

#feed_receiver ⇒ Object



102
103
104
105
106
107
108
109
# File 'lib/iudex-worker/filter_chain_factory.rb', line 102

# Filter list applied to received feed content: redirect handling,
# Revisitor (given visit_counter), feed parsing, DefaultFilter,
# DateChangeFilter( false ) and finally feed_updater.
def feed_receiver
  chain = []
  chain.push( RedirectHandler.new )
  chain.push( Revisitor.new( visit_counter ) )
  chain.push( RomeFeedParser.new )
  chain.push( DefaultFilter.new )
  chain.push( DateChangeFilter.new( false ) )
  chain.push( feed_updater )
  chain
end

#feed_ref_new ⇒ Object



119
120
121
122
123
124
125
# File 'lib/iudex-worker/filter_chain_factory.rb', line 119

# Filter list for the "feed-ref-new" step: UHashMDCSetter, the common
# reference cleanup and a "feed-ref-new" Prioritizer.
def feed_ref_new
  chain = [ UHashMDCSetter.new ]
  chain << ref_common_cleanup
  chain << Prioritizer.new( "feed-ref-new",
                            :constant => 50,
                            :min_next => 0.0 )
  chain
end

#feed_ref_update ⇒ Object



127
128
129
130
131
132
133
134
# File 'lib/iudex-worker/filter_chain_factory.rb', line 127

# Filter list for the "feed-ref-update" step: UHashMDCSetter,
# DateChangeFilter( true ), the common reference cleanup and a
# "feed-ref-update" Prioritizer.
def feed_ref_update
  chain = [ UHashMDCSetter.new ]
  chain << DateChangeFilter.new( true )
  chain << ref_common_cleanup
  chain << Prioritizer.new( "feed-ref-update",
                            :constant => 10,
                            :min_next => 0.0 )
  chain
end

#feed_update_keys ⇒ Object



159
160
161
# File 'lib/iudex-worker/filter_chain_factory.rb', line 159

# The page update keys followed by :title, :summary and :content,
# returned as a new array.
def feed_update_keys
  feed_only = [ :title, :summary, :content ]
  page_update_keys + feed_only
end

#feed_updater ⇒ Object



111
112
113
114
115
116
117
# File 'lib/iudex-worker/filter_chain_factory.rb', line 111

# Update filter for feeds, created over feed_update_keys. Both the
# :on_content and :on_referer options name :feed_post; reference updates
# and new references name :feed_ref_update and :feed_ref_new.
def feed_updater
  fields = feed_update_keys
  create_update_filter( :fields => fields,
                        :on_content => :feed_post,
                        :on_referer => :feed_post,
                        :on_ref_update => :feed_ref_update,
                        :on_ref_new => :feed_ref_new )
end

#filters ⇒ Object



72
73
74
75
76
77
# File 'lib/iudex-worker/filter_chain_factory.rb', line 72

# Top-level filter list: UHashMDCSetter and DefaultFilter first, then
# whatever the parent implementation returns (kept as one nested element),
# then the type switch.
def filters
  chain = [ UHashMDCSetter.new, DefaultFilter.new ]
  chain << super
  chain << type_switch
  chain
end

#last_visit_setter ⇒ Object



209
210
211
212
213
214
215
# File 'lib/iudex-worker/filter_chain_factory.rb', line 209

# Three-element list: a Copier over the :visit_start / :last_visit keys,
# then a Setter writing nil to the :reserved key (only when a work_poller
# is present and reserving), then a Setter writing the poller's instance
# to the :instance key (only when one is available). A slot whose
# condition does not hold contains nil.
def last_visit_setter
  reserving = work_poller && work_poller.reserve?
  owner     = work_poller && work_poller.instance

  chain = [ Copier.new( *keys( :visit_start, :last_visit ) ) ]
  chain << ( reserving ? Setter.new( :reserved.to_k, nil ) : nil )
  chain << ( owner ? Setter.new( :instance.to_k, owner ) : nil )
  chain
end

#listeners ⇒ Object



79
80
81
# File 'lib/iudex-worker/filter_chain_factory.rb', line 79

# The parent's listeners plus an MDCUnsetter for "uhash", as a new array.
def listeners
  inherited = super
  inherited + [ MDCUnsetter.new( "uhash" ) ]
end

#page_fetcher ⇒ Object



97
98
99
100
# File 'lib/iudex-worker/filter_chain_factory.rb', line 97

# Single-element list holding a content fetcher configured with the
# :page_mime_types types and the :page_receiver filters.
def page_fetcher
  fetcher = create_content_fetcher( :types   => :page_mime_types,
                                    :filters => :page_receiver )
  [ fetcher ]
end

#page_post ⇒ Object



189
190
191
192
193
194
195
196
197
# File 'lib/iudex-worker/filter_chain_factory.rb', line 189

# Filter list for the "page-post" step: UHashMDCSetter, the BARC writer,
# a "page-post" Prioritizer (min_next of 30 * 60.0) and last_visit_setter.
def page_post
  chain = [ UHashMDCSetter.new ]
  chain << barc_writer # Not run in 302 referer case, since no SOURCE.
  chain << Prioritizer.new( "page-post",
                            :constant => 0,
                            :min_next => ( 30 * 60.0 ),
                            :visiting_now => true )
  chain << last_visit_setter
  chain
end

#page_receiver ⇒ Object



163
164
165
166
167
168
169
170
# File 'lib/iudex-worker/filter_chain_factory.rb', line 163

# Filter list applied to received page content: redirect handling,
# Revisitor (given visit_counter), character detection, HTML cleaning of
# :source, simhash generation and finally page_updater.
def page_receiver
  chain = []
  chain.push( RedirectHandler.new )
  chain.push( Revisitor.new( visit_counter ) )
  chain.push( CharDetectFilter.new )
  chain.push( html_clean_filters( :source ) )
  chain.push( simhash_generator )
  chain.push( page_updater )
  chain
end

#page_update_keys ⇒ Object



199
200
201
202
203
204
205
206
207
# File 'lib/iudex-worker/filter_chain_factory.rb', line 199

# Ordered list of field symbols used when updating a page. :reserved is
# included only when a work_poller is set and its reserve? is truthy;
# :instance only when a work_poller is set and its instance is truthy.
def page_update_keys
  fields = [ :uhash, :domain, :url, :type ]
  fields << :reserved if work_poller && work_poller.reserve?
  fields << :instance if work_poller && work_poller.instance
  fields.concat( [ :ref_pub_date, :pub_date,
                   :priority, :last_visit, :next_visit_after,
                   :status, :etag, :reason, :referer, :referent,
                   :cache_file, :cache_file_offset, :simhash ] )
end

#page_updater ⇒ Object



183
184
185
186
187
# File 'lib/iudex-worker/filter_chain_factory.rb', line 183

# Update filter for pages, created over page_update_keys. Both the
# :on_content and :on_referer options name :page_post.
def page_updater
  fields = page_update_keys
  create_update_filter( :fields => fields,
                        :on_content => :page_post,
                        :on_referer => :page_post )
end

#ref_common_cleanup ⇒ Object



145
146
147
148
149
# File 'lib/iudex-worker/filter_chain_factory.rb', line 145

# Cleanup filters shared by the feed reference chains: the reference HTML
# filters, a TextCtrlWSFilter on the :title key and a FutureDateFilter on
# the :pub_date key.
def ref_common_cleanup
  cleanup = [ ref_html_filters ]
  cleanup << TextCtrlWSFilter.new( :title.to_k )
  cleanup << FutureDateFilter.new( :pub_date.to_k )
  cleanup
end

#ref_html_filters ⇒ Object



151
152
153
154
155
156
157
# File 'lib/iudex-worker/filter_chain_factory.rb', line 151

# HTML filters for references: clean filters for :title, :summary and
# :content, followed by write filters for :summary and :content.
def ref_html_filters
  cleaners = [ :title, :summary, :content ].map do |field|
    html_clean_filters( field )
  end
  writers = [ :summary, :content ].map do |field|
    html_write_filter( field )
  end
  cleaners + writers
end

#setup_reporters ⇒ Object



68
69
70
# File 'lib/iudex-worker/filter_chain_factory.rb', line 68

# Hook called from the constructor. The body is intentionally empty (it
# contains only the note below), so this implementation does nothing and
# returns nil.
def setup_reporters
  # Use default, preserved for overrides
end

#type_map ⇒ Object



83
84
85
86
# File 'lib/iudex-worker/filter_chain_factory.rb', line 83

# Map from type name to a [ filters, :main ] pair: "FEED" uses the feed
# fetcher and "PAGE" uses the page fetcher.
def type_map
  map = {}
  map[ "FEED" ] = [ feed_fetcher, :main ]
  map[ "PAGE" ] = [ page_fetcher, :main ]
  map
end

#type_switch(tmap = type_map) ⇒ Object



88
89
90
# File 'lib/iudex-worker/filter_chain_factory.rb', line 88

# Create a switch on the :type key using +tmap+ (type_map when omitted).
def type_switch( tmap = type_map )
  type_key = :type.to_k
  create_switch( type_key, tmap )
end