Class: Corpus
- Inherits: Object
- Inheritance chain: Object → Corpus
- Defined in: lib/rbbt/corpus/corpus.rb, lib/rbbt/corpus/sources/pubmed.rb
Constant Summary collapse
- NAMESPACES =
{}
Instance Attribute Summary collapse
-
#corpora_path ⇒ Object
Returns the value of attribute corpora_path.
-
#document_repo ⇒ Object
Returns the value of attribute document_repo.
-
#global_annotations ⇒ Object
Returns the value of attribute global_annotations.
-
#persistence_dir ⇒ Object
Returns the value of attribute persistence_dir.
Instance Method Summary collapse
- #add_document(text, namespace, id, type = nil) ⇒ Object
- #add_pmid(pmid, type = nil) ⇒ Object
- #add_pubmed_query(query, max, type = nil) ⇒ Object
- #docid(docid) ⇒ Object
- #document(namespace, id, type, hash) ⇒ Object
- #exists?(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Boolean
- #find(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Object
- #find_docid(docid) ⇒ Object
- #initialize(corpora_path = nil) ⇒ Corpus (constructor): returns a new instance of Corpus.
- #persistence_for(docid) ⇒ Object
Constructor Details
#initialize(corpora_path = nil) ⇒ Corpus
Returns a new instance of Corpus.
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/rbbt/corpus/corpus.rb', line 6

# Builds a corpus rooted at +corpora_path+.
#
# The root is chosen as follows: +Rbbt.corpora+ when no path is given, the
# argument itself when it already matches +Path+, and
# +Path.setup(corpora_path)+ otherwise. The chosen root is then resolved by
# calling +find+ on it.
#
# Two stores are opened, each inside its own +Misc.lock+ block:
# * the global annotations table, at "global_annotations" inside the
#   "annotations" directory under the root;
# * the document repository, at the root's +document_repo+ location.
#
# @param corpora_path [Object, nil] root of the corpus; presumably a Path or
#   a path-like String (verify against callers)
def initialize(corpora_path = nil)
  root = if corpora_path.nil?
           Rbbt.corpora
         elsif Path === corpora_path
           corpora_path
         else
           Path.setup(corpora_path)
         end

  @corpora_path = root.find
  @persistence_dir = File.join(@corpora_path, "annotations")

  Misc.lock(@persistence_dir) do
    annotations_file = File.join(@persistence_dir, "global_annotations")
    database = Persist.open_tokyocabinet(annotations_file, false, :list)
    @global_annotations = TSV.setup(
      database,
      :key => "ID",
      :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]
    )
    @global_annotations.unnamed = true
    # The table is closed again as soon as it has been set up.
    @global_annotations.close
  end

  Misc.lock(@corpora_path.document_repo) do
    @document_repo = DocumentRepo.open_tokyocabinet(@corpora_path.document_repo, false)
  end
end
Instance Attribute Details
#corpora_path ⇒ Object
Returns the value of attribute corpora_path.
5 6 7 |
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@corpora_path+ instance variable.
#
# @return [Object] the current value of +@corpora_path+
def corpora_path
  @corpora_path
end
#document_repo ⇒ Object
Returns the value of attribute document_repo.
5 6 7 |
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@document_repo+ instance variable.
#
# @return [Object] the current value of +@document_repo+
def document_repo
  @document_repo
end
#global_annotations ⇒ Object
Returns the value of attribute global_annotations.
5 6 7 |
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@global_annotations+ instance variable.
#
# @return [Object] the current value of +@global_annotations+
def global_annotations
  @global_annotations
end
#persistence_dir ⇒ Object
Returns the value of attribute persistence_dir.
5 6 7 |
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@persistence_dir+ instance variable.
#
# @return [Object] the current value of +@persistence_dir+
def persistence_dir
  @persistence_dir
end
Instance Method Details
#add_document(text, namespace, id, type = nil) ⇒ Object
46 47 48 49 |
# File 'lib/rbbt/corpus/corpus.rb', line 46

# Adds +text+ to the document repository, passing along the MD5 hex digest
# of the text as the last argument.
#
# @param text [String] document content; also the input of the digest
# @param namespace [Object] forwarded unchanged to the repository
# @param id [Object] forwarded unchanged to the repository
# @param type [Object, nil] forwarded unchanged to the repository
# @return [Object] whatever +@document_repo.add+ returns
def add_document(text, namespace, id, type = nil)
  @document_repo.add(text, namespace, id, type, Digest::MD5.hexdigest(text))
end
#add_pmid(pmid, type = nil) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/rbbt/corpus/sources/pubmed.rb', line 8

# Fetches the given PubMed article(s) and adds each one to the corpus under
# the +:pubmed+ namespace.
#
# For each article the abstract (+article.text+) is stored when either no
# +type+ was requested and the article has no +pdf_url+, or the requested
# +type+ converts to +:abstract+. In every other case the full text
# (+article.full_text+) is stored.
#
# @param pmid [Object, Array] a single id, or an Array of ids
# @param type [String, Symbol, nil] requested kind; an empty String is
#   treated like +nil+
# @return [Object] the collected results of the +add_document+ calls
# @raise [RuntimeError] when the full text is needed but the article has no
#   +pdf_url+
def add_pmid(pmid, type = nil)
  pmids = pmid.is_a?(Array) ? pmid : [pmid]
  type = nil if type.is_a?(String) && type.empty?

  PubMed.get_article(pmids).collect do |article_id, article|
    use_abstract = if type.nil?
                     # Nothing requested: fall back to the abstract only
                     # when there is no PDF to take the full text from.
                     article.pdf_url.nil?
                   else
                     type.to_sym == :abstract
                   end

    if use_abstract
      add_document(article.text, :pubmed, article_id, :abstract)
    else
      raise "No FullText available for #{ article_id }" if article.pdf_url.nil?
      add_document(article.full_text, :pubmed, article_id, :fulltext)
    end
  end
end
#add_pubmed_query(query, max, type = nil) ⇒ Object
22 23 24 25 |
# File 'lib/rbbt/corpus/sources/pubmed.rb', line 22

# Runs +PubMed.query(query, max)+ and hands the result to +add_pmid+.
#
# @param query [Object] forwarded to +PubMed.query+
# @param max [Object] forwarded to +PubMed.query+
# @param type [Object, nil] forwarded to +add_pmid+
# @return [Object] whatever +add_pmid+ returns
def add_pubmed_query(query, max, type = nil)
  add_pmid(PubMed.query(query, max), type)
end
#docid(docid) ⇒ Object
41 42 43 44 |
# File 'lib/rbbt/corpus/corpus.rb', line 41

# Looks up a document by its full document id.
#
# @param docid [Object] key to look up in +@document_repo+
# @return [Document] built from the persistence path for +docid+, the id
#   itself, the repository entry and the global annotations
# @raise [RuntimeError] when +@document_repo+ does not include +docid+
def docid(docid)
  unless @document_repo.include?(docid)
    raise "Document '#{ docid }' was not found."
  end

  location = persistence_for(docid)
  content = @document_repo[docid]
  Document.new(location, docid, content, @global_annotations)
end
#document(namespace, id, type, hash) ⇒ Object
35 36 37 38 39 |
# File 'lib/rbbt/corpus/corpus.rb', line 35

# Looks up a document by the four parts of its id. The parts are joined
# with ":" to form the key used against +@document_repo+.
#
# @param namespace [Object] first part of the key
# @param id [Object] second part of the key
# @param type [Object] third part of the key
# @param hash [Object] fourth part of the key
# @return [Document] built from the persistence path for the key, the key
#   itself, the repository entry and the global annotations
# @raise [RuntimeError] when +@document_repo+ does not include the key
def document(namespace, id, type, hash)
  key = [namespace, id, type, hash].join(":")

  unless @document_repo.include?(key)
    raise "Document '#{ key }' was not found."
  end

  location = persistence_for(key)
  content = @document_repo[key]
  Document.new(location, key, content, @global_annotations)
end
#exists?(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Boolean
63 64 65 |
# File 'lib/rbbt/corpus/corpus.rb', line 63

# Tells whether +find+ yields anything for the given criteria.
#
# @param namespace [Object, nil] forwarded to +find+
# @param id [Object, nil] forwarded to +find+
# @param type [Object, nil] forwarded to +find+
# @param hash [Object, nil] forwarded to +find+
# @return [Boolean] result of +any?+ on what +find+ returned
def exists?(namespace = nil, id = nil, type = nil, hash = nil)
  matches = find(namespace, id, type, hash)
  matches.any?
end
#find(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Object
51 52 53 54 55 |
# File 'lib/rbbt/corpus/corpus.rb', line 51

# Builds a Document for every document id that +@document_repo.find+ returns
# for the given criteria.
#
# @param namespace [Object, nil] forwarded to the repository
# @param id [Object, nil] forwarded to the repository
# @param type [Object, nil] forwarded to the repository
# @param hash [Object, nil] forwarded to the repository
# @return [Array<Document>] one Document per matching id (assuming the
#   repository returns an Array-like collection)
def find(namespace = nil, id = nil, type = nil, hash = nil)
  matching_ids = @document_repo.find(namespace, id, type, hash)

  matching_ids.map do |found_id|
    Document.new(persistence_for(found_id), found_id, @document_repo[found_id], @global_annotations)
  end
end
#find_docid(docid) ⇒ Object
57 58 59 60 61 |
# File 'lib/rbbt/corpus/corpus.rb', line 57

# Builds a Document for every document id that +@document_repo.find_docid+
# returns for +docid+.
#
# @param docid [Object] forwarded to the repository
# @return [Array<Document>] one Document per matching id (assuming the
#   repository returns an Array-like collection)
def find_docid(docid)
  matching_ids = @document_repo.find_docid(docid)

  matching_ids.map do |found_id|
    Document.new(persistence_for(found_id), found_id, @document_repo[found_id], @global_annotations)
  end
end
#persistence_for(docid) ⇒ Object
31 32 33 |
# File 'lib/rbbt/corpus/corpus.rb', line 31

# Path used for a document's persistence: +docid+ joined onto
# +persistence_dir+ with +File.join+.
#
# @param docid [String] document id used as the last path component
# @return [String] the joined path
def persistence_for(docid)
  base_dir = persistence_dir
  File.join(base_dir, docid)
end