Class: ArticleJSON::Import::GoogleDoc::HTML::TextParser

Inherits:

Object

Object
ArticleJSON::Import::GoogleDoc::HTML::TextParser

show all

Defined in: lib/article_json/import/google_doc/html/text_parser.rb

Class Method Summary collapse

.extract(node:, css_analyzer:) ⇒ Array[ArticleJSON::Elements::Text]

Extract multiple text nodes from a wrapping node. The wrapping node is usually a paragraph or caption.

Instance Method Summary collapse

#bold? ⇒ Boolean

Check if the text node is styled as bold.
#content ⇒ String

The content of the text node, without any markup.
#element ⇒ ArticleJSON::Elements::Text
#href ⇒ String

A possible link target for the text, otherwise `nil`. Google redirects (basically all links in a Google Doc HTML export) are stripped.
#initialize(node:, css_analyzer:) ⇒ TextParser constructor

A new instance of TextParser.
#italic? ⇒ Boolean

Check if the text node is styled as italic.

Constructor Details

#initialize(node:, css_analyzer:) ⇒ `TextParser`

Returns a new instance of TextParser.

Parameters:

node (Nokogiri::HTML::Node)
css_analyzer (ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 8

# Set up a parser around a single node.
#
# @param node [Nokogiri::HTML::Node] the node whose text and styling are read
# @param css_analyzer [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer]
#   analyzer that is asked about the node's CSS classes
def initialize(node:, css_analyzer:)
  @css_analyzer = css_analyzer
  @node = node
end

Class Method Details

.extract(node:, css_analyzer:) ⇒ `Array[ArticleJSON::Elements::Text]`

Extract multiple text nodes from a wrapping node. The wrapping node is usually a paragraph or caption.

Parameters:

node (Nokogiri::HTML::Node)
css_analyzer (ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer)

Returns:

(Array[ArticleJSON::Elements::Text])

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 68

# Build one text element for every non-empty child of a wrapping node.
#
# Children that `NodeAnalyzer#empty?` reports as empty are skipped, and any
# `nil` element is left out of the result.
#
# @param node [Nokogiri::HTML::Node] wrapping node, usually a paragraph or
#   caption
# @param css_analyzer [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer]
# @return [Array<ArticleJSON::Elements::Text>]
def extract(node:, css_analyzer:)
  node.children.each_with_object([]) do |child, texts|
    next if NodeAnalyzer.new(child).empty?

    text = new(node: child, css_analyzer: css_analyzer).element
    texts << text unless text.nil?
  end
end

Instance Method Details

#bold? ⇒ `Boolean`

Check if the text node is styled as bold

Returns:

(Boolean)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 24

# Check if the text node is styled as bold.
#
# Only a `span` carrying a `class` attribute is considered; its class value is
# handed to the CSS analyzer, which makes the final call.
#
# @return [Boolean]
def bold?
  span_with_class = @node.name == 'span' && @node.has_attribute?('class')
  span_with_class && @css_analyzer.bold?(@node.attribute('class').value)
end

#content ⇒ `String`

The content of the text node, without any markup

Returns:

(String)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 15

# The content of the text node, without any markup.
#
# Every `br` child contributes a newline; every other child contributes its
# inner text.
#
# @return [String]
def content
  fragments = @node.children.map do |child|
    next "\n" if child.name == 'br'

    child.inner_text
  end

  # Collapse each linebreak, together with the whitespace (including further
  # linebreaks) around it, into one single newline.
  fragments.join('').gsub(/\s*\n\s*/, "\n")
end

#element ⇒ `ArticleJSON::Elements::Text`

Returns:

(ArticleJSON::Elements::Text)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 53

# Wrap the parsed content, styling flags and link target in a text element.
#
# @return [ArticleJSON::Elements::Text]
def element
  text_class = ArticleJSON::Elements::Text
  attributes = { content: content, bold: bold?, italic: italic?, href: href }
  text_class.new(**attributes)
end

#href ⇒ `String`

A possible link target for the text, otherwise `nil`. Google redirects (basically all links in a Google Doc HTML export) are stripped.

Returns:

(String, nil)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 42

# A possible link target for the text, with Google redirects stripped.
#
# A target is only reported when the node is a `span` whose first element
# child is an `a` tag carrying an `href` attribute.
#
# @return [String, nil] the href after `strip_google_redirect`, or `nil` when
#   the node does not wrap such a link
def href
  return unless @node.name == 'span'

  # Look the child up once instead of querying the node again for each check.
  link = @node.first_element_child
  return unless link&.name == 'a' && link.has_attribute?('href')

  strip_google_redirect(link.attribute('href').value)
end

#italic? ⇒ `Boolean`

Check if the text node is styled as italic

Returns:

(Boolean)

# File 'lib/article_json/import/google_doc/html/text_parser.rb', line 32

# Check if the text node is styled as italic.
#
# Only a `span` carrying a `class` attribute is considered; its class value is
# handed to the CSS analyzer, which makes the final call.
#
# @return [Boolean]
def italic?
  span_with_class = @node.name == 'span' && @node.has_attribute?('class')
  span_with_class && @css_analyzer.italic?(@node.attribute('class').value)
end

Class: ArticleJSON::Import::GoogleDoc::HTML::TextParser

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(node:, css_analyzer:) ⇒ TextParser

Class Method Details

.extract(node:, css_analyzer:) ⇒ Array[ArticleJSON::Elements::Text]

Instance Method Details

#bold? ⇒ Boolean

#content ⇒ String

#element ⇒ ArticleJSON::Elements::Text

#href ⇒ String

#italic? ⇒ Boolean

#initialize(node:, css_analyzer:) ⇒ `TextParser`

.extract(node:, css_analyzer:) ⇒ `Array[ArticleJSON::Elements::Text]`

#bold? ⇒ `Boolean`

#content ⇒ `String`

#element ⇒ `ArticleJSON::Elements::Text`

#href ⇒ `String`

#italic? ⇒ `Boolean`