Class: ArticleJSON::Import::GoogleDoc::HTML::TextParser

Inherits:
Object
  • Object
show all
Defined in:
lib/article_json/import/google_doc/html/text_parser.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(node:, css_analyzer:) ⇒ TextParser

Returns a new instance of TextParser.

Parameters:



8
9
10
11
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 8

# Store the node to parse together with the CSS analyzer used to resolve
# its styling.
#
# @param node [Object] node to parse; the other methods call `name`,
#   `children`, `has_attribute?`, `attribute` and `first_element_child`
#   on it (presumably a Nokogiri node - confirm against callers)
# @param css_analyzer [Object] object answering `bold?` and `italic?` for
#   the value of a `class` attribute
def initialize(node:, css_analyzer:)
  @node, @css_analyzer = node, css_analyzer
end

Class Method Details

.extract(node:, css_analyzer:) ⇒ Array[ArticleJSON::Elements::Text]

Extract multiple text nodes from a wrapping node. The wrapping node is usually a paragraph or caption.

Parameters:

Returns:



68
69
70
71
72
73
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 68

# Build one text element per non-empty child of the wrapping node.
#
# Children that `NodeAnalyzer#empty?` reports as empty are skipped, and
# `nil` elements are dropped from the result.
#
# @param node [Object] wrapping node whose `children` are parsed
# @param css_analyzer [Object] analyzer handed on to every child parser
# @return [Array] the elements built for the remaining children, in
#   document order
def extract(node:, css_analyzer:)
  node.children.each_with_object([]) do |child_node, elements|
    next if NodeAnalyzer.new(child_node).empty?

    parsed = new(node: child_node, css_analyzer: css_analyzer).element
    elements << parsed unless parsed.nil?
  end
end

Instance Method Details

#bold? ⇒ Boolean

Check if the text node is styled as bold

Returns:

  • (Boolean)


24
25
26
27
28
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 24

# Check if the text node is styled as bold.
#
# Only a `span` carrying a `class` attribute can qualify; the decision is
# then delegated to the CSS analyzer, which receives the attribute value.
#
# @return [Boolean]
def bold?
  span_with_class = @node.name == 'span' && @node.has_attribute?('class')
  span_with_class && @css_analyzer.bold?(@node.attribute('class').value)
end

#content ⇒ String

The content of the text node, w/o any markup

Returns:

  • (String)


15
16
17
18
19
20
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 15

# The content of the text node, without any markup.
#
# Every `br` child contributes a newline, every other child its inner
# text. Whitespace around a linebreak is removed afterwards, which also
# folds consecutive linebreaks into a single one.
#
# @return [String]
def content
  fragments = @node.children.map do |child|
    if child.name == 'br'
      "\n"
    else
      child.inner_text
    end
  end
  raw_text = fragments.join('')
  # Only keep a single consecutive linebreak
  raw_text.gsub(/\s*\n\s*/, "\n")
end

#element ⇒ ArticleJSON::Elements::Text



53
54
55
56
57
58
59
60
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 53

# Build the text element for this node from its content, its bold and
# italic flags and its link target.
#
# @return [ArticleJSON::Elements::Text]
def element
  text = content
  is_bold = bold?
  is_italic = italic?
  link_target = href

  ArticleJSON::Elements::Text.new(
    content: text, bold: is_bold, italic: is_italic, href: link_target
  )
end

#href ⇒ String

A possible link target for the text, otherwise `nil`. Google redirects (basically all links in a Google Doc HTML export) are stripped.

Returns:

  • (String)


42
43
44
45
46
47
48
49
50
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 42

# A possible link target for the text, otherwise `nil`.
#
# A target only exists when the node is a `span` whose first element
# child is an `a` tag with an `href` attribute. The attribute value is
# passed through `strip_google_redirect` before it is returned.
#
# @return [String, nil]
def href
  return unless @node.name == 'span'

  link = @node.first_element_child
  return unless link&.name == 'a' && link.has_attribute?('href')

  strip_google_redirect(link.attribute('href').value)
end

#italic? ⇒ Boolean

Check if the text node is styled as italic

Returns:

  • (Boolean)


32
33
34
35
36
# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 32

# Check if the text node is styled as italic.
#
# Only a `span` carrying a `class` attribute can qualify; the decision is
# then delegated to the CSS analyzer, which receives the attribute value.
#
# @return [Boolean]
def italic?
  span_with_class = @node.name == 'span' && @node.has_attribute?('class')
  span_with_class && @css_analyzer.italic?(@node.attribute('class').value)
end