Class: Stevedore::StevedoreCsvRow

Inherits:
StevedoreBlob show all
Defined in:
lib/parsers/stevedore_csv_row.rb

Instance Attribute Summary collapse

Attributes inherited from StevedoreBlob

#extra

Instance Method Summary collapse

Methods inherited from StevedoreBlob

#analyze!, new_from_tika

Constructor Details

#initialize(title, text, row_num, download_url, whole_row = {}) ⇒ StevedoreCsvRow

Returns a new instance of StevedoreCsvRow.



6
7
8
9
10
11
12
# File 'lib/parsers/stevedore_csv_row.rb', line 6

# Stores the given values on the new StevedoreCsvRow through its attribute
# writers.
#
# @param title [Object, nil] value for +title+; when it is nil (or false)
#   +download_url+ is used as the title instead
# @param text [Object] stored unchanged in +text+
# @param row_num [Object] stored unchanged in +row_num+
# @param download_url [Object] stored in +download_url+; doubles as the
#   fallback title
# @param whole_row [Object] stored unchanged in +whole_row+; defaults to an
#   empty Hash
def initialize(title, text, row_num, download_url, whole_row={})
  effective_title = title || download_url

  self.download_url = download_url
  self.row_num = row_num
  self.whole_row = whole_row
  self.text = text
  self.title = effective_title
end

Instance Attribute Details

#download_url ⇒ Object

Returns the value of attribute download_url.



5
6
7
# File 'lib/parsers/stevedore_csv_row.rb', line 5

# Reader for the +download_url+ attribute.
#
# @return [Object] whatever is currently held in @download_url (nil if unset)
def download_url; @download_url; end

#row_num ⇒ Object

Returns the value of attribute row_num.



5
6
7
# File 'lib/parsers/stevedore_csv_row.rb', line 5

# Reader for the +row_num+ attribute.
#
# @return [Object] whatever is currently held in @row_num (nil if unset)
def row_num; @row_num; end

#text ⇒ Object

Returns the value of attribute text.



5
6
7
# File 'lib/parsers/stevedore_csv_row.rb', line 5

# Reader for the +text+ attribute.
#
# @return [Object] whatever is currently held in @text (nil if unset)
def text; @text; end

#title ⇒ Object

Returns the value of attribute title.



5
6
7
# File 'lib/parsers/stevedore_csv_row.rb', line 5

# Reader for the +title+ attribute.
#
# @return [Object] whatever is currently held in @title (nil if unset)
def title; @title; end

#whole_row ⇒ Object

Returns the value of attribute whole_row.



5
6
7
# File 'lib/parsers/stevedore_csv_row.rb', line 5

# Reader for the +whole_row+ attribute.
#
# @return [Object] whatever is currently held in @whole_row (nil if unset)
def whole_row; @whole_row; end

Instance Method Details

#clean_text ⇒ Object



14
15
16
# File 'lib/parsers/stevedore_csv_row.rb', line 14

# Returns +text+ with every tag-like span removed, i.e. anything of the form
# "<...>" or "</...>" containing at least one character other than ">".
#
# The result is memoized in @clean_text, so later changes to +text+ are not
# reflected once this has been called.
#
# +text+ is coerced with #to_s first so a nil text yields "" instead of
# raising NoMethodError.
#
# @return [String] the text with tag-like spans stripped
def clean_text
  @clean_text ||= text.to_s.gsub(%r{</?[^>]+>}, '') # removes all tags
end

#to_hash ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/parsers/stevedore_csv_row.rb', line 18

# Builds the Hash representation of this row.
#
# Keys produced:
# * "sha1"       - SHA1 hex digest of +download_url+ followed by +row_num+.
#   The digest input is built by interpolation, so a nil +download_url+
#   contributes an empty string instead of raising.
# * "title"      - +title+ as a String.
# * "source_url" - +download_url+ as a String.
# * "file"       - nested Hash with the title and the tag-stripped text.
# * "analyzed"   - nested Hash with the tag-stripped text as "body" and a
#   "metadata" Hash. The metadata starts from a "text/plain" Content-Type
#   and is merged with +whole_row.to_h+, so an entry in the row with the
#   key "Content-Type" overrides the default.
# * "_updatedAt" - the DateTime at which this method was called.
#
# @return [Hash] a freshly built Hash on every call
def to_hash
  {
    "sha1" => Digest::SHA1.hexdigest("#{download_url}#{row_num}"),
    "title" => title.to_s,
    "source_url" => download_url.to_s,
    "file" => {
      "title" => title.to_s,
      "file" => clean_text.to_s
    },
    "analyzed" => {
      "body" => clean_text.to_s,
      "metadata" => {
        "Content-Type" => "text/plain"
      }.merge(whole_row.to_h)
    },
    "_updatedAt" => DateTime.now
  }
end