Module: PacerXml::Sample

Defined in:
lib/pacer-xml/sample.rb

Class Method Summary collapse

Class Method Details

.cleanup(fn = nil) ⇒ Object



125
126
127
128
129
# File 'lib/pacer-xml/sample.rb', line 125

# Deletes every entry in /tmp whose name starts with the prefix of +fn+.
#
# fn - a String; only the text before its first underscore is used as the
#      prefix. When nil, the result of a_week is used instead.
#
# Returns the Array of paths matched by the glob (all of them deleted).
# Raises ArgumentError when no prefix can be derived, because the glob would
# then be "/tmp/*" and match unrelated files.
def cleanup(fn = nil)
  fn ||= a_week
  name = fn.split('_').first
  raise ArgumentError, "cannot derive a file prefix from #{fn.inspect}" if name.nil? || name.empty?
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
end

.importer(graph = nil, args = {}, &block) ⇒ Object

Sample of using the xml import function with some advanced options to clean up the resulting graph.

Import can successfully be run with no options specified, but this patent xml is particularly hairy.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/pacer-xml/sample.rb', line 88

# Builds the sample patent import: reads xml via #xml and imports it into
# +graph+ with html, skip, rename, cache and with_body options set.
#
# graph - graph to import into; Pacer.tg is used when nil.
# args  - options Hash. :cache is merged over the default cache settings,
#         :silent disables the progress output; the Hash is also passed
#         on to #xml.
# block - forwarded to #xml.
#
# Returns whatever the xml route's import call returns.
def importer(graph = nil, args = {}, &block)
  html_elements = [:abstract, :description]
  body_elements = ['claim-text']
  renames = {
    'classification-national' => 'class',
    'assistant-examiner' => 'examiner',
    'primary-examiner' => 'examiner',
    'us-term-of-grant' => 'term',
    'addressbook' => 'entity',
    'document-id' => 'document',
    'us-related-documents' => 'related-document',
    'us-patent-grant' => 'patent-version',
    'us-bibliographic-data-grant' => 'patent',
    "us-field-of-classification-search" => 'possible-class'
  }
  skipped = Set['classification-ipcr']
  uncached = Set['figures', 'figure']
  # Caller-supplied cache settings win over the defaults.
  cache_options = { stats: true, skip: uncached }.merge(args.fetch(:cache, {}))

  graph ||= Pacer.tg
  graph.create_key_index :type, :vertex

  # The clock starts before the xml route is built, so the reported time
  # includes whatever #xml does up front.
  started_at = Time.now
  imported = 0
  route = xml(args, &block)

  # Counts every element passing through and reports once per 100.
  progress = proc do
    imported += 1
    puts "\n       #{ imported } patents in #{ Time.now - started_at }s" if (imported % 100).zero?
  end
  route = route.process(&progress) unless args[:silent]

  route.import(graph, html: html_elements, skip: skipped, rename: renames, cache: cache_options, with_body: body_elements)
end

.load_100(*args, &block) ⇒ Object

Will actually load 101. To avoid this side-effect of prefetching, the route should be defined as: xml_route.limit(100).import(…)



9
10
11
12
13
# File 'lib/pacer-xml/sample.rb', line 9

# Runs the sample importer limited to 100 elements and returns its graph.
# As noted for this method, prefetching makes it actually load 101.
#
# args, block - forwarded unchanged to #importer.
def load_100(*args, &block)
  full_route = importer(*args, &block)
  limited = full_route.limit(100)
  limited.run!
  limited.graph
end

.load_100_software(*args) ⇒ Object



31
32
33
34
35
36
37
# File 'lib/pacer-xml/sample.rb', line 31

# Same as #load_100_with_text, keeping only raw xml documents that match
# /software/i.
#
# args - forwarded unchanged to #load_100_with_text.
def load_100_software(*args)
  load_100_with_text(*args) do |documents|
    documents.select { |text| text =~ /software/i }
  end
end

.load_100_with_text(graph = nil, args = {}, &block) ⇒ Object



15
16
17
# File 'lib/pacer-xml/sample.rb', line 15

# Calls #load_100 with :source forced to :full_text.
#
# graph - passed through to #load_100.
# args  - options Hash; a copy with source: :full_text is passed on.
# block - passed through to #load_100.
def load_100_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_100(graph, full_text_args, &block)
end

.load_all(graph = nil, args = {}, &block) ⇒ Object

Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.

Alternatively, to start the JVM with more memory, try: bundle exec jruby -J-Xmx2g -S irb



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/pacer-xml/sample.rb', line 45

# Builds the sample importer against a Neo4j graph.
#
# graph - graph to import into; when nil a new Pacer.neo4j graph named
#         "sample.<n>.graph" is opened, n being derived from the current time.
# args  - options Hash forwarded to #importer. With a truthy :thread the
#         import is run in a background thread.
# block - forwarded to #importer.
#
# Returns the importer route (not yet run) or, with :thread, the Thread
# running it; the thread carries the graph under the :graph key.
def load_all(graph = nil, args = {}, &block)
  require 'pacer-neo4j'
  suffix = Time.now.to_i % 1_000_000
  graph ||= Pacer.neo4j "sample.#{suffix}.graph"
  route = importer(graph, args, &block)
  return route unless args[:thread]

  worker = Thread.new do
    begin
      route.run!
    rescue Exception => error
      # Failures are only printed; nothing is re-raised.
      # NOTE(review): rescuing Exception is presumably meant to also report
      # JVM-level errors - confirm before narrowing it.
      pp error
      pp error.backtrace
    end
  end
  worker[:graph] = graph
  worker
end

.load_all_software(*args) ⇒ Object



23
24
25
26
27
28
29
# File 'lib/pacer-xml/sample.rb', line 23

# Same as #load_all_with_text, keeping only raw xml documents that match
# /software/i.
#
# args - forwarded unchanged to #load_all_with_text.
def load_all_software(*args)
  load_all_with_text(*args) do |documents|
    documents.select { |text| text =~ /software/i }
  end
end

.load_all_with_text(graph = nil, args = {}, &block) ⇒ Object



19
20
21
# File 'lib/pacer-xml/sample.rb', line 19

# Calls #load_all with :source forced to :full_text.
#
# graph - passed through to #load_all.
# args  - options Hash; a copy with source: :full_text is passed on.
# block - passed through to #load_all.
def load_all_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_all(graph, full_text_args, &block)
end

.path(args) ⇒ Object



131
132
133
134
135
136
137
# File 'lib/pacer-xml/sample.rb', line 131

# Local file path for the patent xml.
#
# args - options Hash. A truthy :path is returned as is; otherwise the path
#        is "/tmp/<file>.xml", where <file> is patent_file(args) with its
#        first "_wk<digits>" part removed.
def path(args)
  explicit = args[:path]
  return explicit if explicit

  basename = patent_file(args).sub(/_wk\d+/, '')
  "/tmp/#{basename}.xml"
end

.structure(g) ⇒ Object



66
67
68
# File 'lib/pacer-xml/sample.rb', line 66

# Delegates to Pacer::Utils::GraphAnalysis.structure for the graph +g+ and
# returns its result.
def structure(g)
  Pacer::Utils::GraphAnalysis.structure(g)
end

.structure!(g, fn = 'patent-structure.graphml') ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/pacer-xml/sample.rb', line 70

# Computes the structure of +g+ via #structure and, unless +fn+ is nil or
# false, exports it to +fn+ with Pacer::Utils::YFilesExport and prints the
# file name.
#
# g  - graph to analyse.
# fn - output file name; pass nil to skip the export.
#
# Returns the structure in both cases.
def structure!(g, fn = 'patent-structure.graphml')
  summary = structure(g)
  return summary unless fn

  exporter = Pacer::Utils::YFilesExport.new
  # Labels are taken from the structure's own name settings.
  exporter.vertex_label = summary.vertex_name
  exporter.edge_label = summary.edge_name
  exporter.export(summary, fn)
  puts
  puts "Wrote #{ fn }"
  summary
end

.url(args) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
# File 'lib/pacer-xml/sample.rb', line 139

# Download url for the patent archive.
#
# args - options Hash, checked in this order:
#        :url    - returned as is when truthy;
#        :path   - when truthy there is nothing to download, so nil;
#        :source - :full_text selects the grant_full_text archive, anything
#                  else the grantbib archive.
#
# Returns a String url or nil.
def url(args)
  explicit = args[:url]
  return explicit if explicit
  return nil if args[:path]

  full_text = args[:source] == :full_text
  year = patent_year(args)
  file = patent_file(args)
  if full_text
    "http://storage.googleapis.com/patents/grant_full_text/#{year}/#{file}.zip"
  else
    "http://storage.googleapis.com/patents/grantbib/#{year}/#{file}.zip"
  end
end

.xml(args, &block) ⇒ Object



120
121
122
123
# File 'lib/pacer-xml/sample.rb', line 120

# Obtains the patent xml file via download_patent_grant and opens it with
# Pacer.xml.
#
# args  - options Hash passed to download_patent_grant; :start_chunk_rule
#         and :end_chunk_rule are handed on to Pacer.xml.
# block - forwarded to Pacer.xml.
#
# Returns the result of Pacer.xml.
def xml(args, &block)
  file = download_patent_grant(args)
  first_rule = args[:start_chunk_rule]
  last_rule = args[:end_chunk_rule]
  Pacer.xml(file, first_rule, last_rule, &block)
end