Module: PacerXml::Sample
- Defined in:
- lib/pacer-xml/sample.rb
Class Method Summary collapse
- .cleanup(fn = nil) ⇒ Object
-
.importer(graph = nil, args = {}, &block) ⇒ Object
Sample of using the xml import function with some advanced options to clean up the resulting graph.
-
.load_100(*args, &block) ⇒ Object
Will actually load 101.
- .load_100_software(*args) ⇒ Object
- .load_100_with_text(graph = nil, args = {}, &block) ⇒ Object
-
.load_all(graph = nil, args = {}, &block) ⇒ Object
Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.
- .load_all_software(*args) ⇒ Object
- .load_all_with_text(graph = nil, args = {}, &block) ⇒ Object
- .path(args) ⇒ Object
- .structure(g) ⇒ Object
- .structure!(g, fn = 'patent-structure.graphml') ⇒ Object
- .url(args) ⇒ Object
- .xml(args, &block) ⇒ Object
Class Method Details
.cleanup(fn = nil) ⇒ Object
125 126 127 128 129 |
# File 'lib/pacer-xml/sample.rb', line 125

# Delete the entries directly under /tmp whose names start with the part of
# +fn+ that precedes its first underscore.
#
# @param fn [String, nil] file name whose leading segment (everything before
#   the first "_") is used as the prefix; when nil, the value returned by
#   a_week is used instead.
# @return [Array<String>] the paths that matched the prefix and were deleted.
# @raise [ArgumentError] when no prefix can be derived from +fn+; an empty
#   prefix would turn the glob into "/tmp/*" and wipe unrelated files.
def cleanup(fn = nil)
  fn ||= a_week
  # Only the leading segment matters; anything after the first "_" is ignored.
  prefix = fn.split('_').first.to_s
  raise ArgumentError, "cannot derive a file prefix from #{fn.inspect}" if prefix.empty?

  Dir["/tmp/#{prefix}*"].each { |f| File.delete f }
end
.importer(graph = nil, args = {}, &block) ⇒ Object
Sample of using the xml import function with some advanced options to clean up the resulting graph.
Import can successfully be run with no options specified, but this patent xml is particularly hairy.
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/pacer-xml/sample.rb', line 88

# Sample of using the xml import function with some advanced options to
# clean up the resulting graph.
#
# @param graph [Object, nil] target graph; when nil, Pacer.tg supplies one.
# @param args [Hash] options. This method reads :cache (merged over the
#   default cache settings) and :silent (suppresses progress output); the
#   whole hash is also forwarded to xml.
# @param block forwarded to xml.
# @return [Object] whatever xml_route.import returns.
def importer(graph = nil, args = {}, &block)
  html_elements = [:abstract, :description]
  body_elements = ['claim-text']
  renames = {
    'classification-national'           => 'class',
    'assistant-examiner'                => 'examiner',
    'primary-examiner'                  => 'examiner',
    'us-term-of-grant'                  => 'term',
    'addressbook'                       => 'entity',
    'document-id'                       => 'document',
    'us-related-documents'              => 'related-document',
    'us-patent-grant'                   => 'patent-version',
    'us-bibliographic-data-grant'       => 'patent',
    'us-field-of-classification-search' => 'possible-class'
  }
  skipped = Set['classification-ipcr']
  uncached = Set['figures', 'figure']
  # Caller-supplied cache options win over the defaults.
  cache_options = { stats: true, skip: uncached }.merge(args.fetch(:cache, {}))

  graph ||= Pacer.tg
  graph.create_key_index :type, :vertex

  # The clock starts before the xml route is built so that the reported
  # elapsed time covers that step as well.
  started_at = Time.now
  seen = 0
  route = xml(args, &block)
  unless args[:silent]
    # Progress report every 100 documents passing through the route.
    route = route.process do
      seen += 1
      puts "\n #{ seen } patents in #{ Time.now - started_at }s" if (seen % 100).zero?
    end
  end

  route.import(graph,
               html: html_elements,
               skip: skipped,
               rename: renames,
               cache: cache_options,
               with_body: body_elements)
end
.load_100(*args, &block) ⇒ Object
Will actually load 101. To avoid this side-effect of prefetching, the route should be defined as: xml_route.limit(100).import(…)
9 10 11 12 13 |
# File 'lib/pacer-xml/sample.rb', line 9

# Build the sample importer, cap it at 100 results, run it and hand back
# the graph it wrote into.
#
# Will actually load 101: the limit is applied after the import step, so
# prefetching pulls one extra document. Defining the route as
# xml_route.limit(100).import(...) would avoid that.
#
# @param args forwarded unchanged to importer.
# @param block forwarded unchanged to importer.
# @return [Object] the graph reported by the limited import route.
def load_100(*args, &block)
  limited_import = importer(*args, &block).limit(100)
  limited_import.run!
  limited_import.graph
end
.load_100_software(*args) ⇒ Object
31 32 33 34 35 36 37 |
# File 'lib/pacer-xml/sample.rb', line 31

# Same as load_100_with_text, but keeps only the raw documents that
# mention "software" (case-insensitive).
#
# @param args forwarded unchanged to load_100_with_text.
# @return [Object] whatever load_100_with_text returns.
def load_100_software(*args)
  load_100_with_text(*args) do |documents|
    documents.select { |document| document =~ /software/i }
  end
end
.load_100_with_text(graph = nil, args = {}, &block) ⇒ Object
15 16 17 |
# File 'lib/pacer-xml/sample.rb', line 15

# Call load_100 with the :source option forced to :full_text.
#
# @param graph [Object, nil] passed straight through to load_100.
# @param args [Hash] options; a copy with source: :full_text merged in is
#   passed on, so the caller's hash is not modified.
# @param block forwarded unchanged to load_100.
# @return [Object] whatever load_100 returns.
def load_100_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_100(graph, full_text_args, &block)
end
.load_all(graph = nil, args = {}, &block) ⇒ Object
Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.
Alternatively, to start the JVM with more memory, try: bundle exec jruby -J-Xmx2g -S irb
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/pacer-xml/sample.rb', line 45

# Set up an import of the whole sample into a Neo4j graph.
#
# Uses a Neo4j graph because the data is too big to fit in memory without
# configuring the JVM to use more than its small default footprint.
#
# @param graph [Object, nil] target graph; when nil, a Neo4j graph named
#   "sample.<n>.graph" is opened, where n comes from the current time.
# @param args [Hash] options forwarded to importer. With a truthy :thread
#   the import is run on a new thread.
# @param block forwarded unchanged to importer.
# @return [Thread, Object] with :thread, the running thread (the graph is
#   stored under its :graph key); otherwise the import route, which this
#   method does not run itself.
def load_all(graph = nil, args = {}, &block)
  # Loaded lazily so the Neo4j dependency is only needed by this method.
  require 'pacer-neo4j'
  suffix = Time.now.to_i % 1000000
  graph ||= Pacer.neo4j "sample.#{suffix}.graph"
  route = importer(graph, args, &block)
  return route unless args[:thread]

  worker = Thread.new do
    begin
      route.run!
    rescue Exception => error
      # Print any failure instead of letting the thread die unreported.
      pp error
      pp error.backtrace
    end
  end
  worker[:graph] = graph
  worker
end
.load_all_software(*args) ⇒ Object
23 24 25 26 27 28 29 |
# File 'lib/pacer-xml/sample.rb', line 23

# Same as load_all_with_text, but keeps only the raw documents that
# mention "software" (case-insensitive).
#
# @param args forwarded unchanged to load_all_with_text.
# @return [Object] whatever load_all_with_text returns.
def load_all_software(*args)
  load_all_with_text(*args) do |documents|
    documents.select { |document| document =~ /software/i }
  end
end
.load_all_with_text(graph = nil, args = {}, &block) ⇒ Object
19 20 21 |
# File 'lib/pacer-xml/sample.rb', line 19

# Call load_all with the :source option forced to :full_text.
#
# @param graph [Object, nil] passed straight through to load_all.
# @param args [Hash] options; a copy with source: :full_text merged in is
#   passed on, so the caller's hash is not modified.
# @param block forwarded unchanged to load_all.
# @return [Object] whatever load_all returns.
def load_all_with_text(graph = nil, args = {}, &block)
  full_text_args = args.merge(source: :full_text)
  load_all(graph, full_text_args, &block)
end
.path(args) ⇒ Object
131 132 133 134 135 136 137 |
# File 'lib/pacer-xml/sample.rb', line 131

# Work out which local XML file to use.
#
# @param args [Hash] options; a truthy :path is returned as-is.
# @return [String] args[:path] when given, otherwise
#   "/tmp/<name>.xml" where <name> is patent_file(args) with its first
#   "_wk<digits>" part removed.
def path(args)
  return args[:path] if args[:path]

  "/tmp/#{patent_file(args).sub(/_wk\d+/, '')}.xml"
end
.structure(g) ⇒ Object
66 67 68 |
# File 'lib/pacer-xml/sample.rb', line 66

# Thin wrapper around Pacer::Utils::GraphAnalysis.structure.
#
# @param g [Object] the graph to analyse.
# @return [Object] whatever Pacer::Utils::GraphAnalysis.structure returns.
def structure(g)
  Pacer::Utils::GraphAnalysis.structure(g)
end
.structure!(g, fn = 'patent-structure.graphml') ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/pacer-xml/sample.rb', line 70

# Compute the structure of +g+ and, unless +fn+ is nil or false, export it
# through Pacer::Utils::YFilesExport to the file +fn+.
#
# @param g [Object] the graph to analyse; passed to structure.
# @param fn [String, nil, false] export file name; a falsy value skips the
#   export and the "Wrote ..." message.
# @return [Object] the value returned by structure(g).
def structure!(g, fn = 'patent-structure.graphml')
  summary = structure(g)
  return summary unless fn

  exporter = Pacer::Utils::YFilesExport.new
  # Label vertices and edges with the names the structure result provides.
  exporter.vertex_label = summary.vertex_name
  exporter.edge_label = summary.edge_name
  exporter.export(summary, fn)
  puts
  puts "Wrote #{ fn }"
  summary
end
.url(args) ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/pacer-xml/sample.rb', line 139

# Work out which URL to use for the patent archive.
#
# @param args [Hash] options; reads :url, :path and :source, and is passed
#   on to patent_year and patent_file when a URL has to be built.
# @return [String, nil] args[:url] when given; nil when only :path is given;
#   otherwise a storage.googleapis.com .zip URL under "grant_full_text" when
#   :source is :full_text and under "grantbib" for any other :source.
def url(args)
  return args[:url] if args[:url]
  # An explicit local path means no URL is produced.
  return nil if args[:path]

  if args[:source] == :full_text
    "http://storage.googleapis.com/patents/grant_full_text/#{patent_year(args)}/#{patent_file(args)}.zip"
  else
    "http://storage.googleapis.com/patents/grantbib/#{patent_year(args)}/#{patent_file(args)}.zip"
  end
end