Module: Document::Corpus

Defined in:
lib/rbbt/document/corpus.rb,
lib/rbbt/document/corpus/pubmed.rb

Constant Summary collapse

PUBMED_NAMESPACE =
"PMID"

Class Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.claims ⇒ Object

Returns the value of attribute claims.



62
63
64
# File 'lib/rbbt/document/corpus.rb', line 62

# Reader for the @claims instance variable, the registry that `claim` fills
# with namespace-string => block entries. Returns nil if @claims has never
# been assigned.
def claims
  @claims
end

Class Method Details

.claim(namespace, &block) ⇒ Object



63
64
65
66
# File 'lib/rbbt/document/corpus.rb', line 63

# Registers +block+ as the handler for +namespace+ in the @claims registry.
#
# The registry is keyed by the string form of the namespace, so :PMID and
# "PMID" address the same entry; registering a namespace again replaces its
# previous handler.
#
# @param namespace [#to_s] namespace the block is responsible for
# @param block [Proc] handler stored for that namespace
# @return [Proc] the block that was stored
def claim(namespace, &block)
  # Create the registry only once: rebuilding it on every call would throw
  # away the handlers already registered for other namespaces.
  @claims ||= {}
  @claims[namespace.to_s] = block
end

.setup(corpus) ⇒ Object



6
7
8
9
10
11
12
# File 'lib/rbbt/document/corpus.rb', line 6

# Turns +corpus+ into a document corpus and returns it.
#
# A String is treated as a location and opened through
# Persist.open_tokyocabinet; anything else is used as given. The object is
# then extended with Document::Corpus and Persist::TSVAdapter (each only if
# it is not already of that kind) and closed before being returned.
def self.setup(corpus)
  case corpus
  when String
    corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB")
  end

  [Document::Corpus, Persist::TSVAdapter].each do |mixin|
    corpus.extend(mixin) unless mixin === corpus
  end

  corpus.close
  corpus
end

Instance Method Details

#[](*args) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/rbbt/document/corpus.rb', line 37

# Looks up an entry through the underlying store (via super, inside
# read_and_close) and returns it re-tagged with Encoding.default_external.
#
# With more than one argument the stored value is returned as is. With a
# single document id ("namespace:id:type") a missing entry is delegated to
# the block registered for that namespace in Document::Corpus.claims, if any,
# and a non-nil result is passed through Document.setup before being
# returned.
def [](*args)
  docid = args.first

  text = read_and_close { super(*args) }
  text.force_encoding(Encoding.default_external) if text

  # Multi-argument lookups hand back the raw stored value.
  return text if args.length > 1

  namespace, id, type = docid.split(":")

  if text.nil?
    handlers = Document::Corpus.claims
    claim_key = namespace.to_s
    if handlers && handlers.include?(claim_key)
      # The claim block runs with this corpus as self.
      text = instance_exec(id, type, &handlers[claim_key])
    end
  end

  text.force_encoding(Encoding.default_external) if text
  Document.setup(text, namespace, id, type, self) unless text.nil?

  text
end

#add_document(document) ⇒ Object



14
15
16
17
18
19
20
21
22
# File 'lib/rbbt/document/corpus.rb', line 14

# Stores +document+ under its docid unless that docid is already present.
#
# When the docid exists, the entry already in the corpus is returned and
# nothing is written; otherwise the document is written inside
# write_and_close and that call's result is returned.
def add_document(document)
  key = document.docid

  read_and_close do
    # Returning from inside the block leaves the whole method.
    return self[key] if include?(key)
  end

  write_and_close { self[key] = document }
end

#add_pmid(pmid, type = :title_and_abstract, update = false) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/rbbt/document/corpus/pubmed.rb', line 5

# Fetches PubMed articles and adds them to this corpus as documents.
#
# @param pmid [Object, Array] a single PubMed id, or an Array of them
# @param type [Symbol, String, nil] which text to store: :abstract, :title or
#   :title_and_abstract; any other value is treated as a full-text request.
#   nil and "" fall back to :title_and_abstract.
# @param update [Boolean] when false and +pmid+ is a single id, a document
#   already returned by #documents for that id is handed back without
#   fetching anything
# @return the document for a single +pmid+, or the list of documents (passed
#   through Document.setup) when +pmid+ is an Array
# @raise [RuntimeError] when full text is requested but the article has none
def add_pmid(pmid, type = :title_and_abstract, update = false)
  type = :title_and_abstract if type.nil?

  unless update || Array === pmid
    id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
    existing = self.documents(id)
    return existing.first if existing.any?
  end

  pmids = Array === pmid ? pmid : [pmid]

  # An empty type used to be turned into nil here and then crash on
  # nil.to_sym below; treat it like an omitted type instead.
  type = :title_and_abstract if String === type && type.empty?

  res = PubMed.get_article(pmids).collect do |article_pmid, article|
    kind = type.to_sym

    document = case kind
               when :abstract
                 Document.setup(article.abstract || "", PUBMED_NAMESPACE, article_pmid, kind, self, :corpus => self)
               when :title
                 Document.setup(article.title || "", PUBMED_NAMESPACE, article_pmid, kind, self)
               when :title_and_abstract
                 title = article.title
                 abstract = article.abstract

                 text = if title.nil? || title.empty?
                          abstract || ""
                        else
                          title = title + "." unless title.end_with?(".")
                          # Without an abstract the title alone is the text;
                          # previously this case left the text nil.
                          abstract.nil? || abstract.empty? ? title : title + " " + abstract
                        end

                 Document.setup(text, PUBMED_NAMESPACE, article_pmid, kind, self)
               else
                 raise "No FullText available for #{ article_pmid }" if article.full_text.nil?
                 Document.setup(article.full_text, PUBMED_NAMESPACE, article_pmid, :fulltext, self, :corpus => self)
               end

    Log.debug "Loading pmid #{article_pmid}"
    add_document(document) if document
    document
  end

  if Array === pmid
    corpus = res.first.corpus if res.first
    Document.setup(res, :corpus => corpus)
  else
    res = res.first
  end

  res
end

#add_pubmed_query(query, max = 3000, type = nil) ⇒ Object



55
56
57
58
# File 'lib/rbbt/document/corpus/pubmed.rb', line 55

# Runs +query+ through PubMed.query, asking for at most +max+ results, and
# adds whatever it returns via #add_pmid with the given +type+.
# Returns the result of that #add_pmid call.
def add_pubmed_query(query, max = 3000, type = nil)
  add_pmid(PubMed.query(query, max), type)
end

#docids(*prefix) ⇒ Object



24
25
26
27
28
29
30
31
# File 'lib/rbbt/document/corpus.rb', line 24

# Lists document ids in the corpus as a DocID collection tied to this corpus.
#
# The arguments are joined with ":" into a key prefix, and a trailing ":" is
# appended when missing. The single word "all" (String or Symbol) selects
# every key instead of a prefix match.
def docids(*prefix)
  key = prefix.join(":")
  everything = key == "all"
  key = key + ":" unless everything || key.end_with?(":")

  ids = read_and_close do
    everything ? keys : self.prefix(key)
  end

  DocID.setup(ids, :corpus => self)
end

#documents(*prefix) ⇒ Object



33
34
35
# File 'lib/rbbt/document/corpus.rb', line 33

# Resolves the ids selected by #docids for the given prefix parts and returns
# the result of calling #document on that id collection.
def documents(*prefix)
  ids = docids(*prefix)
  ids.document
end