Module: Wikipedia

Defined in:
lib/wikipedia/vandalism_detection/wikitext_extractor.rb,
lib/wikipedia.rb,
lib/wikipedia/vandalism_detection/diff.rb,
lib/wikipedia/vandalism_detection/edit.rb,
lib/wikipedia/vandalism_detection/page.rb,
lib/wikipedia/vandalism_detection/text.rb,
lib/wikipedia/vandalism_detection/version.rb,
lib/wikipedia/vandalism_detection/revision.rb,
lib/wikipedia/vandalism_detection/evaluator.rb,
lib/wikipedia/vandalism_detection/instances.rb,
lib/wikipedia/vandalism_detection/algorithms.rb,
lib/wikipedia/vandalism_detection/classifier.rb,
lib/wikipedia/vandalism_detection/exceptions.rb,
lib/wikipedia/vandalism_detection/word_lists.rb,
lib/wikipedia/vandalism_detection/page_parser.rb,
lib/wikipedia/vandalism_detection/test_dataset.rb,
lib/wikipedia/vandalism_detection/configuration.rb,
lib/wikipedia/vandalism_detection/features/base.rb,
lib/wikipedia/vandalism_detection/word_lists/bad.rb,
lib/wikipedia/vandalism_detection/word_lists/sex.rb,
lib/wikipedia/vandalism_detection/revision_parser.rb,
lib/wikipedia/vandalism_detection/features/weekday.rb,
lib/wikipedia/vandalism_detection/training_dataset.rb,
lib/wikipedia/vandalism_detection/features/blanking.rb,
lib/wikipedia/vandalism_detection/features/copyedit.rb,
lib/wikipedia/vandalism_detection/features/reverted.rb,
lib/wikipedia/vandalism_detection/word_lists/biased.rb,
lib/wikipedia/vandalism_detection/word_lists/markup.rb,
lib/wikipedia/vandalism_detection/feature_calculator.rb,
lib/wikipedia/vandalism_detection/features/anonymity.rb,
lib/wikipedia/vandalism_detection/features/bad_impact.rb,
lib/wikipedia/vandalism_detection/features/sex_impact.rb,
lib/wikipedia/vandalism_detection/features/size_ratio.rb,
lib/wikipedia/vandalism_detection/word_lists/pronouns.rb,
lib/wikipedia/vandalism_detection/features/digit_ratio.rb,
lib/wikipedia/vandalism_detection/features/impact_base.rb,
lib/wikipedia/vandalism_detection/features/same_editor.rb,
lib/wikipedia/vandalism_detection/features/time_of_day.rb,
lib/wikipedia/vandalism_detection/word_lists/emoticons.rb,
lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb,
lib/wikipedia/vandalism_detection/features/article_size.rb,
lib/wikipedia/vandalism_detection/features/longest_word.rb,
lib/wikipedia/vandalism_detection/features/removed_size.rb,
lib/wikipedia/vandalism_detection/features/bad_frequency.rb,
lib/wikipedia/vandalism_detection/features/biased_impact.rb,
lib/wikipedia/vandalism_detection/features/contains_base.rb,
lib/wikipedia/vandalism_detection/features/inserted_size.rb,
lib/wikipedia/vandalism_detection/features/markup_impact.rb,
lib/wikipedia/vandalism_detection/features/personal_life.rb,
lib/wikipedia/vandalism_detection/features/removed_words.rb,
lib/wikipedia/vandalism_detection/features/sex_frequency.rb,
lib/wikipedia/vandalism_detection/features/time_interval.rb,
lib/wikipedia/vandalism_detection/features/comment_length.rb,
lib/wikipedia/vandalism_detection/features/edits_per_user.rb,
lib/wikipedia/vandalism_detection/features/frequency_base.rb,
lib/wikipedia/vandalism_detection/features/inserted_words.rb,
lib/wikipedia/vandalism_detection/features/pronoun_impact.rb,
lib/wikipedia/vandalism_detection/features/size_increment.rb,
lib/wikipedia/vandalism_detection/features/term_frequency.rb,
lib/wikipedia/vandalism_detection/features/compressibility.rb,
lib/wikipedia/vandalism_detection/features/user_reputation.rb,
lib/wikipedia/vandalism_detection/features/words_increment.rb,
lib/wikipedia/vandalism_detection/features/biased_frequency.rb,
lib/wikipedia/vandalism_detection/features/emoticons_impact.rb,
lib/wikipedia/vandalism_detection/features/markup_frequency.rb,
lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb,
lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb,
lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb,
lib/wikipedia/vandalism_detection/features/anonymity_previous.rb,
lib/wikipedia/vandalism_detection/features/character_sequence.rb,
lib/wikipedia/vandalism_detection/features/character_diversity.rb,
lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb,
lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb,
lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb,
lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb,
lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb,
lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb,
lib/wikipedia/vandalism_detection/features/replacement_similarity.rb,
lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb,
lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb,
lib/wikipedia/vandalism_detection/features/inserted_external_links.rb,
lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb,
lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb,
lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb,
lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb,
lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb,
lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb,
lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb,
lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb,
lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb,
lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb,
lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb,
lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb

Overview

The WikitextExtractor imports the WikitextExtractor class from sweble-wikitext-extractor.jar. The sweble-wikitext-extractor.jar is a custom Java project that uses the Sweble wikitext parser to extract plain text from wiki markup.

The Sweble WikitextExtractor currently depends on swc-engine v1.1.0 (with dependencies), which can be downloaded from sweble.org/downloads/swc-devel/master-latest/.

The Java source code can be found on: webis.uni-weimar.de:/srv/cvsroot/code-in-progress/wikipedia-vandalism-detection/sweble-wikitext-extractor

Author:

Defined Under Namespace

Modules: VandalismDetection

Class Method Summary collapse

Class Method Details

.api_base_uri ⇒ Object



7
8
9
# File 'lib/wikipedia.rb', line 7

# Returns the base URL of the English Wikipedia API with the XML format
# and the "query" action already selected. The string ends with "&".
#
# @return [String]
def self.api_base_uri
  'http://en.wikipedia.org/w/api.php?format=xml&action=query&'
end

.api_request(params = {}) ⇒ Object



39
40
41
42
43
# File 'lib/wikipedia.rb', line 39

# Sends a query to the Wikipedia API and parses the response as XML.
#
# The request URL is api_base_uri followed by the parameters rendered by
# param_string; the whole URL is escaped before it is requested. The request
# is made through request_with_retry with 3 retries.
#
# @param params [Hash] query parameters appended to the base URI
# @return [Object] the result of Nokogiri::XML for the response content
def api_request(params = {})
  # URI.encode / URI.escape were removed in Ruby 3.0; the default parser's
  # #escape performs the same escaping and is available on old and new Rubies.
  uri = URI::DEFAULT_PARSER.escape(api_base_uri + param_string(params))
  content = request_with_retry(uri, 3)
  Nokogiri::XML(content)
end

.param_string(params) ⇒ Object



15
16
17
# File 'lib/wikipedia.rb', line 15

# Renders the given parameters as a "key=value" list joined by "&".
# Keys and values are interpolated as they are; nothing is escaped here.
#
# @param params [Hash] parameter names mapped to their values
# @return [String] the joined parameter string ("" for an empty hash)
def self.param_string(params)
  pairs = params.map do |key, value|
    "#{key}=#{value}"
  end

  pairs.join('&')
end

.request_with_retry(uri, times = 1, timeout = 5) ⇒ Object

Retries the request when it raises an error (for example, a timeout).



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/wikipedia.rb', line 20

# Reads the content behind the given URI, retrying when an attempt fails.
#
# Every attempt is wrapped in Timeout.timeout. Any StandardError (which
# includes Timeout::Error) triggers a retry until the retries are used up;
# after that a RuntimeError is raised.
#
# NOTE(review): relies on the parsed URI responding to #read, presumably
# provided by open-uri being loaded elsewhere - confirm.
#
# @param uri [String] the URI to request
# @param times [Integer] number of retries after the first failed attempt
# @param timeout [Numeric] seconds passed to Timeout.timeout for each attempt
# @return [Object] whatever URI.parse(uri).read returns
# @raise [RuntimeError] if the request still fails after all retries
def self.request_with_retry(uri, times = 1, timeout = 5)
  # Count down on a copy so the error message can still report the number
  # of retries that was actually requested.
  retries_left = times

  begin
    Timeout.timeout(timeout) { URI.parse(uri).read }
  rescue StandardError => e
    if retries_left > 0
      retries_left -= 1
      retry
    end

    raise "#{times} times retrying request failed.\n#{e.message}"
  end
end

.wikitrust_base_uri ⇒ Object



11
12
13
# File 'lib/wikipedia.rb', line 11

# Returns the base URL of the WikiTrust remote API with the "wikimarkup"
# method already selected. The string ends with "&".
#
# @return [String]
def self.wikitrust_base_uri
  'http://en.collaborativetrust.com/WikiTrust/RemoteAPI?method=wikimarkup&'
end

.wikitrust_request(params = {}) ⇒ Object



45
46
47
48
# File 'lib/wikipedia.rb', line 45

# Sends a request to the WikiTrust remote API and returns the raw response.
#
# The request URL is wikitrust_base_uri followed by the parameters rendered
# by param_string; the whole URL is escaped before it is requested. The
# request is made through request_with_retry with 3 retries.
#
# @param params [Hash] query parameters appended to the base URI
# @return [Object] the content returned by request_with_retry
def wikitrust_request(params = {})
  # URI.encode / URI.escape were removed in Ruby 3.0; the default parser's
  # #escape performs the same escaping and is available on old and new Rubies.
  uri = URI::DEFAULT_PARSER.escape(wikitrust_base_uri + param_string(params))
  request_with_retry(uri, 3)
end