Module: Iudex::HTML::Filters::FactoryHelper

Includes:
Core, Tree, Tree::Filters
Defined in:
lib/iudex-html/factory_helper.rb

Instance Method Summary collapse

Methods included from Tree

parse

Instance Method Details

#html_clean_filters(src_key, tree_key = nil) ⇒ Object

Create HTML parse and clean filters. Expected usage:

PAGE: html_clean_filters( :source  )
FEED: html_clean_filters( :title   )
FEED: html_clean_filters( :summary )
FEED: html_clean_filters( :content )


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/iudex-html/factory_helper.rb', line 35

# Build the list of filters that parse the HTML held under src_key into
# a tree and then clean that tree.
#
# src_key::  key of the HTML source; converted with #to_k.
# tree_key:: key for the parsed tree; when omitted it is derived as
#            "<src_key>_tree".
#
# Returns a two element Array: the parse filter from html_parse_filter,
# followed by an HTMLTreeFilter that applies the html_tree_filters chain
# to the tree key using HTMLTreeFilter::Order::DEPTH_FIRST.
def html_clean_filters( src_key, tree_key = nil )
  tree_key ||= "#{src_key}_tree".to_sym
  source = src_key.to_k
  tree   = tree_key.to_k

  parser = html_parse_filter( source, tree )

  # FIXME: PAGE: add TitleExtractor.new here, or after the tree filter?

  # FIXME: if src is text, last filter
  # TextCtrlWSFilter.new( ContentKeys::TITLE )

  chain   = TreeFilterChain.new( html_tree_filters )
  cleaner = HTMLTreeFilter.new( tree, chain,
                                HTMLTreeFilter::Order::DEPTH_FIRST )

  # FIXME: First block extractor back to text key?

  [ parser, cleaner ]
end

#html_parse_filter(src_key, tree_key = nil) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
# File 'lib/iudex-html/factory_helper.rb', line 69

# Create the HTMLParseFilter that reads from src_key and writes the
# parsed tree to tree_key.
#
# src_key::  key of the HTML source; converted with #to_k.
# tree_key:: key for the parsed tree; when omitted it is derived as
#            "<src_key>_tree".
#
# When the source key's value_type equals ContentSource.java_class the
# three argument constructor is used, with nil in the middle position;
# otherwise the two argument form is used.
# NOTE(review): the meaning of the nil middle argument is not visible
# here -- confirm against HTMLParseFilter.
def html_parse_filter( src_key, tree_key = nil )
  tree_key ||= "#{src_key}_tree".to_sym
  source = src_key.to_k
  tree   = tree_key.to_k

  if source.value_type == ContentSource.java_class
    return HTMLParseFilter.new( source, nil, tree )
  end

  HTMLParseFilter.new( source, tree )
end

#html_tree_filters ⇒ Object



58
59
60
61
62
63
64
65
66
67
# File 'lib/iudex-html/factory_helper.rb', line 58

# Return a new Array of freshly constructed tree filters, in the order
# they are to be applied. The trailing notes record ordering
# constraints and open questions carried over from the original list.
def html_tree_filters
  filters = []
  filters << XmpToPreConverter.new      # Before CharactersNormalizer
  filters << CSSDisplayFilter.new       # Before AttributeCleaner
  filters << AttributeCleaner.new
  filters << MojiBakeCleaner.new
  filters << CharactersNormalizer.new
  filters << EmptyInlineRemover.new     # Depth
  filters << WordCounter.new            # Depth; only for count deps?
  filters << WordyCounter.new           # Depth; only with cleaners/simhash?
  filters
end

#html_write_filter(key1, key2 = nil) ⇒ Object

Expected usage:

FEED: html_write_filter( :summary )


83
84
85
86
87
88
89
90
91
92
# File 'lib/iudex-html/factory_helper.rb', line 83

# Create an HTMLWriteFilter from a tree key and an output key.
#
# With two arguments, key1 is the tree key and key2 the output key.
# With only key1, the output key is key1 and the tree key is derived as
# "<key1>_tree". Both keys are converted with #to_k before being passed
# to HTMLWriteFilter.new.
#
# Expected usage:
#
#   FEED: html_write_filter( :summary )
def html_write_filter( key1, key2 = nil )
  tree_key = key2 ? key1 : "#{key1}_tree".to_sym
  out_key  = key2 || key1

  HTMLWriteFilter.new( tree_key.to_k, out_key.to_k )
end