Module: PacerXml::Sample

Defined in: lib/pacer-xml/sample.rb

Class Method Summary

.cleanup(fn = nil) ⇒ Object
.importer(graph = nil, args = {}, &block) ⇒ Object

Sample of using the xml import function with some advanced options to clean up the resulting graph.
.load_100(*args, &block) ⇒ Object

Will actually load 101.
.load_100_software(*args) ⇒ Object
.load_100_with_text(graph = nil, args = {}, &block) ⇒ Object
.load_all(graph = nil, args = {}, &block) ⇒ Object

Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.
.load_all_software(*args) ⇒ Object
.load_all_with_text(graph = nil, args = {}, &block) ⇒ Object
.path(args) ⇒ Object
.structure(g) ⇒ Object
.structure!(g, fn = 'patent-structure.graphml') ⇒ Object
.url(args) ⇒ Object
.xml(args, &block) ⇒ Object

Class Method Details

.cleanup(fn = nil) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 125

# Deletes every file matching "/tmp/<prefix>*", where <prefix> is the part
# of +fn+ that precedes its first underscore.
#
# @param fn [String, nil] a file name; when nil, the value returned by
#   +a_week+ is used instead.
# @return [Array<String>] the paths that matched the glob (empty when +fn+
#   has no prefix before its first underscore).
def cleanup(fn = nil)
  fn ||= a_week
  # Only the leading segment selects files; the rest of the name is unused.
  name = fn.split('_').first
  # An empty prefix would turn the glob into "/tmp/*" and delete unrelated
  # files, so there is nothing safe to do in that case.
  return [] if name.nil? || name.empty?
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
end

.importer(graph = nil, args = {}, &block) ⇒ `Object`

Sample of using the xml import function with some advanced options to clean up the resulting graph.

Import can successfully be run with no options specified, but this patent xml is particularly hairy.

# File 'lib/pacer-xml/sample.rb', line 88

# Sample of using the xml import function with some advanced options to
# clean up the resulting graph.
#
# @param graph [Object, nil] target graph; defaults to +Pacer.tg+ when nil.
#   A key index on :type for vertices is created on it before importing.
# @param args [Hash] options. +:cache+ is merged over the default cache
#   settings, +:silent+ suppresses the progress output, and the whole hash
#   is forwarded to +xml+.
# @param block forwarded to +xml+.
# @return [Object] whatever the route's +import+ call returns.
def importer(graph = nil, args = {}, &block)
  element_renames = {
    'classification-national' => 'class',
    'assistant-examiner' => 'examiner',
    'primary-examiner' => 'examiner',
    'us-term-of-grant' => 'term',
    'addressbook' => 'entity',
    'document-id' => 'document',
    'us-related-documents' => 'related-document',
    'us-patent-grant' => 'patent-version',
    'us-bibliographic-data-grant' => 'patent',
    'us-field-of-classification-search' => 'possible-class'
  }
  html_elements = [:abstract, :description]
  body_elements = ['claim-text']
  skipped_elements = Set['classification-ipcr']
  # Caller-supplied cache settings take precedence over these defaults.
  cache_defaults = { stats: true, skip: Set['figures', 'figure'] }
  cache_options = cache_defaults.merge(args.fetch(:cache, {}))

  graph ||= Pacer.tg
  graph.create_key_index :type, :vertex

  started_at = Time.now
  count = 0
  route = xml(args, &block)
  unless args[:silent]
    # Report progress once for every 100 processed elements.
    route = route.process do
      count += 1
      puts "\n       #{ count } patents in #{ Time.now - started_at }s" if (count % 100).zero?
    end
  end

  route.import(graph,
               html: html_elements,
               skip: skipped_elements,
               rename: element_renames,
               cache: cache_options,
               with_body: body_elements)
end

.load_100(*args, &block) ⇒ `Object`

Will actually load 101. To avoid this side-effect of prefetching, the route should be defined as: xml_route.limit(100).import(…)

# File 'lib/pacer-xml/sample.rb', line 9

# Builds the importer route, caps it with +limit(100)+, runs it and returns
# the graph it populated.
#
# @param args forwarded unchanged to +importer+.
# @param block forwarded unchanged to +importer+.
# @return [Object] the +graph+ reported by the limited route.
def load_100(*args, &block)
  limited_route = importer(*args, &block).limit(100)
  limited_route.run!
  limited_route.graph
end

.load_100_software(*args) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 31

# Same as +load_100_with_text+, but keeps only the raw XML documents whose
# text matches /software/i (case-insensitive).
#
# @param args forwarded unchanged to +load_100_with_text+.
# @return [Object] whatever +load_100_with_text+ returns.
def load_100_software(*args)
  load_100_with_text(*args) do |xml_documents|
    xml_documents.select { |raw_xml| raw_xml =~ /software/i }
  end
end

.load_100_with_text(graph = nil, args = {}, &block) ⇒ `Object`



15
16
17

# File 'lib/pacer-xml/sample.rb', line 15

# Calls +load_100+ with the +:source+ option set to +:full_text+.
#
# @param graph [Object, nil] passed through to +load_100+.
# @param args [Hash] options; a copy with +source: :full_text+ merged in is
#   passed on, the caller's hash is not modified.
# @param block forwarded to +load_100+.
# @return [Object] whatever +load_100+ returns.
def load_100_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_100(graph, full_text_args, &block)
end

.load_all(graph = nil, args = {}, &block) ⇒ `Object`

Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.

Alternatively, to start the JVM with more memory, try: bundle exec jruby -J-Xmx2g -S irb

# File 'lib/pacer-xml/sample.rb', line 45

# Builds the importer route for the whole data set. Uses a Neo4j graph
# because the data is too big to fit in memory without configuring the JVM
# to use more than its small default footprint.
#
# @param graph [Object, nil] target graph; when nil a Neo4j graph named
#   "sample.<n>.graph" is opened, where <n> is derived from the current time.
# @param args [Hash] options forwarded to +importer+; a truthy +:thread+
#   runs the import in a new thread.
# @param block forwarded to +importer+.
# @return [Thread, Object] with +:thread+, the started Thread (the graph is
#   stored in its +:graph+ thread-local); otherwise the importer route,
#   which this method does not run.
def load_all(graph = nil, args = {}, &block)
  require 'pacer-neo4j'
  suffix = Time.now.to_i % 1000000
  graph ||= Pacer.neo4j "sample.#{suffix}.graph"
  route = importer(graph, args, &block)
  return route unless args[:thread]

  worker = Thread.new do
    begin
      route.run!
    rescue Exception => error
      # Report the failure from inside the thread instead of letting it
      # propagate.
      pp error
      pp error.backtrace
    end
  end
  worker[:graph] = graph
  worker
end

.load_all_software(*args) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 23

# Same as +load_all_with_text+, but keeps only the raw XML documents whose
# text matches /software/i (case-insensitive).
#
# @param args forwarded unchanged to +load_all_with_text+.
# @return [Object] whatever +load_all_with_text+ returns.
def load_all_software(*args)
  load_all_with_text(*args) do |xml_documents|
    xml_documents.select { |raw_xml| raw_xml =~ /software/i }
  end
end

.load_all_with_text(graph = nil, args = {}, &block) ⇒ `Object`



19
20
21

# File 'lib/pacer-xml/sample.rb', line 19

# Calls +load_all+ with the +:source+ option set to +:full_text+.
#
# @param graph [Object, nil] passed through to +load_all+.
# @param args [Hash] options; a copy with +source: :full_text+ merged in is
#   passed on, the caller's hash is not modified.
# @param block forwarded to +load_all+.
# @return [Object] whatever +load_all+ returns.
def load_all_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_all(graph, full_text_args, &block)
end

.path(args) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 131

# Local file path for the patent XML.
#
# @param args [Hash] options; +:path+, when truthy, is returned as is.
# @return [Object] +args[:path]+ if set, otherwise "/tmp/<file>.xml" where
#   <file> is +patent_file(args)+ with its first "_wk<digits>" part removed.
def path(args)
  args[:path] || "/tmp/#{patent_file(args).sub(/_wk\d+/, '')}.xml"
end

.structure(g) ⇒ `Object`



66
67
68

# File 'lib/pacer-xml/sample.rb', line 66

# Delegates to +Pacer::Utils::GraphAnalysis.structure+.
#
# @param g [Object] the graph to analyse.
# @return [Object] whatever the GraphAnalysis call returns.
def structure(g)
  Pacer::Utils::GraphAnalysis.structure(g)
end

.structure!(g, fn = 'patent-structure.graphml') ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 70

# Computes the structure of +g+ via +structure+ and, unless +fn+ is falsy,
# exports it with +Pacer::Utils::YFilesExport+ and prints the file name.
#
# @param g [Object] the graph to analyse.
# @param fn [String, nil, false] export file name; pass nil or false to
#   skip the export.
# @return [Object] the value returned by +structure(g)+.
def structure!(g, fn = 'patent-structure.graphml')
  summary = structure(g)
  return summary unless fn

  exporter = Pacer::Utils::YFilesExport.new
  # Label exported elements with the names provided by the structure result.
  exporter.vertex_label = summary.vertex_name
  exporter.edge_label = summary.edge_name
  exporter.export(summary, fn)
  puts
  puts "Wrote #{ fn }"
  summary
end

.url(args) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 139

# Download URL for the patent archive.
#
# @param args [Hash] options. A truthy +:url+ is returned as is; otherwise
#   a truthy +:path+ means there is no URL; otherwise +:source+ selects
#   between the full-text and the bibliographic archive.
# @return [Object, nil] the URL, or nil when only +:path+ was given.
def url(args)
  return args[:url] if args[:url]
  return nil if args[:path]

  if args[:source] == :full_text
    "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
  else
    "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
  end
end

.xml(args, &block) ⇒ `Object`

# File 'lib/pacer-xml/sample.rb', line 120

# Obtains the patent grant file via +download_patent_grant+ and hands it to
# +Pacer.xml+.
#
# @param args [Hash] options; passed to +download_patent_grant+, and its
#   +:start_chunk_rule+ / +:end_chunk_rule+ entries are passed to +Pacer.xml+.
# @param block forwarded to +Pacer.xml+.
# @return [Object] whatever +Pacer.xml+ returns.
def xml(args, &block)
  local_file = download_patent_grant(args)
  Pacer.xml(local_file, args[:start_chunk_rule], args[:end_chunk_rule], &block)
end

Module: PacerXml::Sample

Class Method Summary collapse

Class Method Details

.cleanup(fn = nil) ⇒ Object

.importer(graph = nil, args = {}, &block) ⇒ Object

.load_100(*args, &block) ⇒ Object

.load_100_software(*args) ⇒ Object

.load_100_with_text(graph = nil, args = {}, &block) ⇒ Object

.load_all(graph = nil, args = {}, &block) ⇒ Object

.load_all_software(*args) ⇒ Object

.load_all_with_text(graph = nil, args = {}, &block) ⇒ Object

.path(args) ⇒ Object

.structure(g) ⇒ Object

.structure!(g, fn = 'patent-structure.graphml') ⇒ Object

.url(args) ⇒ Object

.xml(args, &block) ⇒ Object