Class: TextPreprocessing

Inherits:
Object
  • Object
show all
Defined in:
lib/automated_metareview/text_preprocessing.rb

Instance Method Summary collapse

Instance Method Details

#check_correct_spellings(review_text_array, speller) ⇒ Object

Looks for spelling mistakes in the text and fixes them using the raspell library for Ruby.



269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/automated_metareview/text_preprocessing.rb', line 269

# Replaces misspelled tokens in every review with the speller's first suggestion.
#
# Each review is split on whitespace. A token that fails +speller.check+ is
# swapped for the first entry returned by +speller.suggest+ (when there is
# one); every token is then lower-cased. The rebuilt text puts a single space
# in front of every token, so a non-empty review comes back with a leading
# space and with runs of whitespace collapsed.
#
# @param review_text_array [Array<String>] review texts to correct
# @param speller object responding to +check(word)+ and +suggest(word)+
# @return [Array<String>] one corrected, lower-cased text per input review
def check_correct_spellings(review_text_array, speller)
  review_text_array.map do |review_text|
    corrected_tokens = review_text.split(" ").map do |token|
      unless speller.check(token)
        # Look the suggestions up once per misspelled token instead of twice.
        best_guess = speller.suggest(token).first
        token = best_guess unless best_guess.nil?
      end
      " " + token.downcase
    end
    corrected_tokens.join
  end
end

#contains_punct(str) ⇒ Object

The method was throwing a “NoMethodError: private method” error when called from a different class. Hence the “public” keyword.



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/automated_metareview/text_preprocessing.rb', line 297

# Strips one kind of punctuation mark from +str+, modifying it in place.
#
# The marks are tried in a fixed order ( . , ? ! ; : ( ) [ ] ). Only the first
# mark that occurs in +str+ is removed (all of its occurrences); any other
# marks are left untouched, e.g. "a.b,c" becomes "ab,c".
#
# @param str [String] token to clean; mutated when a mark is found
# @return [String] the same +str+ object
def contains_punct(str)
  marks = [".", ",", "?", "!", ";", ":", "(", ")", "[", "]"]
  first_present = marks.find { |mark| str.include?(mark) }
  # gsub! (in place) for every mark, including ";", which used to be skipped
  # because the non-mutating gsub discarded its result.
  str.gsub!(first_present, "") unless first_present.nil?
  str
end

#contains_punct_bool(str) ⇒ Object



322
323
324
325
326
327
328
# File 'lib/automated_metareview/text_preprocessing.rb', line 322

# Tells whether +str+ contains a curly brace or the two-character sequence
# backslash + "n" (a literal "\n" as text, not a newline character).
#
# @param str [String] text to inspect
# @return [Boolean] true when any of the three markers occurs in +str+
def contains_punct_bool(str)
  markers = ["\\n", "}", "{"]
  markers.any? { |marker| str.include?(marker) }
end

#fetch_review_data(auto_metareview, map_id) ⇒ Object

Fetching review data from the tables based on the response_map id



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/automated_metareview/text_preprocessing.rb', line 10

# Loads the most recently updated Response for a response map and collects
# its non-blank score comments.
#
# The response that was found is also stored on +auto_metareview+ (its
# +responses+ and +response_id+ attributes are assigned).
#
# @param auto_metareview object with +responses=+ and +response_id=+ writers
# @param map_id id of the response map whose latest response is wanted
# @return [Array] comments of the response's scores, skipping nil and
#   whitespace-only comments
def fetch_review_data(auto_metareview, map_id)
  latest_response = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
  auto_metareview.responses = latest_response
  auto_metareview.response_id = latest_response.id

  all_comments = latest_response.scores.map { |score| score.comments }
  all_comments.reject { |comment| comment.nil? || comment.rstrip.empty? }
end

#fetch_submission_data(map_id) ⇒ Object

Fetching submission data from the url submitted by the reviewee



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/automated_metareview/text_preprocessing.rb', line 30

# Collects the text of the submission that a response map points at.
#
# Looks up the ResponseMap for +map_id+, resolves a URL string for its
# reviewee (falling back to the members of the reviewee's team), loads that
# page with Nokogiri and gathers the text of every <p> element plus text
# embedded in <script> elements.
#
# NOTE(review): in this listing both `Participant.find(...)` statements end in
# a dangling `.` - the name of the method that yields the URL string appears to
# be missing, so the code does not parse as shown. Confirm against the
# repository copy of the file before changing anything here.
#
# @param map_id id of the response map whose submission is fetched
# @return [Array] the collected text fragments
def fetch_submission_data(map_id)
  subm_array = Array.new
  response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
  reviewee_id = response_map.reviewee_id
  reviewed_object = response_map.reviewed_object_id
  # First attempt: treat the reviewee id as a participant id.
  url = Participant.find(:first, :conditions => ["id = ?", reviewee_id]).
  if(url.nil?)#in case of team assignments  
    # Fallback: treat the reviewee id as a team id and try each team member in turn.
    teams_users = TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id])
    teams_users.each{
      |team_user|
      url = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object]).
      if(!url.nil?)#break out when you find the url
        break
      end
    }
  end
  # puts "***url #{url} #{url}"  
  #fetching the url submitted by the reviewee
  # The slice starts at the last "http" and stops at index length-2, i.e. it
  # drops the final character of the stored value.
  # NOTE(review): presumably that character is a trailing delimiter - confirm.
  # NOTE(review): url is still nil here when no lookup succeeded; url.rindex would then raise.
  url = url[url.rindex("http")..url.length-2] #use "rindex" to fetch last occurrence of the substring - useful if there are multiple urls
  # puts "***url #{url} #{url.class}" 
  # NOTE(review): open(url) on an http address presumably relies on open-uri being loaded elsewhere - confirm.
  page = Nokogiri::HTML(open(url))
  #fetching the paragraph texts from the specified url
  if(page.css('p').count != 0)
    page.css('p').each do |subm|
      # puts "subm.text.. #{subm.text}"
      subm_array << subm.text 
    end 
  end
  #for google docs where the text is placed inside <script></script> tags
  if(page.css('script').count != 0)
    page.css('script').each do |subm|
      # Only scripts whose first child contains both the opening marker "s":" and
      # the closing marker \n"}, (backslash + n as literal text) are used.
      if(!subm.children[0].to_s.index("\"s\":\"").nil? and !subm.children[0].to_s.index("\\n\"},").nil?) #the string indicates the beginning of the text in the script
        # NOTE(review): String#[start, length] takes a length as its second
        # argument, but the index of the closing marker is passed here, so the
        # fragment can run past the closing marker - verify the intent.
        subm_array << subm.children[0].to_s[subm.children[0].to_s.index("\"s\":\"")+5, subm.children[0].to_s.index("\\n\"},")]
      end
    end
  end
  return subm_array  
end

#is_punct(str) ⇒ Object

Checking if “str” is a punctuation mark like “.”, “,”, “?” etc.



334
335
336
337
338
339
340
# File 'lib/automated_metareview/text_preprocessing.rb', line 334

# Tells whether +str+ is exactly one of the punctuation marks . , ? ! ; :
#
# @param str [String] token to test
# @return [Boolean] true only when the whole token equals one of the marks
def is_punct(str)
  [".", ",", "?", "!", ";", ":"].include?(str)
end

#read_patterns(filename, pos) ⇒ Object

  • Reads the patterns from the csv file containing them.

    * maxValue is the maximum value of the patterns found
    


160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/automated_metareview/text_preprocessing.rb', line 160

# Reads edge patterns from a CSV file and builds one Edge per row.
#
# Only the first column of each row is used. It is split at its first "=":
# the text before it is the in-vertex, the text starting two characters after
# it is the out-vertex (both stripped). The first word of each vertex is passed
# to +pos.get_readable+, and the returned "word/TAG" string decides the vertex
# type and supplies the POS tag stored on the Vertex.
#
# The state given to every vertex comes from the file name: NEGATED when it
# contains "prob", SUGGESTIVE when it contains "suggest", POSITIVE otherwise.
#
# @param filename [String] path of the CSV pattern file
# @param pos object responding to +get_readable(word)+
# @return [Array] the Edge objects, in file order
def read_patterns(filename, pos)
  state = if filename.include?("prob")
            NEGATED
          elsif filename.include?("suggest")
            SUGGESTIVE
          else
            POSITIVE
          end

  patterns = []
  FasterCSV.foreach(filename) do |text|
    row = text[0]
    in_vertex = row[0..row.index("=")-1].strip
    out_vertex = row[row.index("=")+2..row.length].strip

    # The first token of each vertex determines its part of speech.
    tagged_in = pos.get_readable(in_vertex.split(" ")[0])
    tagged_out = pos.get_readable(out_vertex.split(" ")[0])

    index = patterns.length # edges are numbered in file order, starting at 0
    edge = Edge.new("noun", NOUN)
    edge.in_vertex = pattern_vertex(in_vertex, tagged_in, index, state)
    edge.out_vertex = pattern_vertex(out_vertex, tagged_out, index, state)
    patterns << edge
  end
  patterns
end

# Builds the Vertex for one side of a pattern, choosing its type from the
# tagged first token ("word/TAG"); unrecognised tags default to NOUN.
def pattern_vertex(vertex_text, tagged_token, index, state)
  # Everything after the first "/" is kept as the POS tag.
  pos_tag = tagged_token[tagged_token.index("/")+1..tagged_token.length]
  type = if ["/NN", "/PRP", "/IN", "/EX", "/WP"].any? { |tag| tagged_token.include?(tag) }
           NOUN
         elsif tagged_token.include?("/VB") || tagged_token.include?("MD")
           VERB
         elsif tagged_token.include?("JJ")
           ADJ
         elsif tagged_token.include?("/RB")
           ADV
         else
           NOUN
         end
  Vertex.new(vertex_text, type, index, state, nil, nil, pos_tag)
end
private :pattern_vertex

#remove_text_within_quotes(review_text) ⇒ Object

Removes text within quotes from the reviews, so that plagiarism can be checked on the remaining text.



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/automated_metareview/text_preprocessing.rb', line 237

# Removes every double-quoted segment (quotes included) from each review text.
#
# Each row is modified in place and the same String objects are returned in a
# new array. After the quoted segments are removed, a quote mark that has no
# partner is deleted as well, so no '"' character is left in the result. Rows
# without any quote mark are not touched.
#
# The removal is done with a single regular-expression pass, which does not
# depend on how Array#to_s renders a scan result and always terminates, also
# for an unmatched quote or an empty "" pair.
#
# @param review_text [Array<String>] review texts; rows containing quotes are mutated
# @return [Array<String>] the rows with quoted text removed
def remove_text_within_quotes(review_text)
  review_text.map do |row|
    if row.include?("\"")
      row.gsub!(/"[^"]*"/, "") # drop complete "..." segments
      row.delete!("\"")        # drop a leftover, unpaired quote mark
    end
    row
  end
end

#remove_urls(text) ⇒ Object

Removes any urls in the text and returns the remaining text as it is



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/automated_metareview/text_preprocessing.rb', line 217

# Removes URL tokens from +text+.
#
# When +text+ contains no "http://" or "https://" it is returned unchanged
# (same object). Otherwise the text is split on whitespace, every token that
# contains one of those prefixes is dropped, and the remaining tokens are
# joined with a single space in front of each one - so the result starts with
# a space and runs of whitespace are collapsed.
#
# @param text [String] text that may contain URLs
# @return [String] the text without URL tokens
def remove_urls(text)
  schemes = ["http://", "https://"]
  has_url = lambda { |piece| schemes.any? { |scheme| piece.include?(scheme) } }
  return text unless has_url.call(text)

  kept_tokens = text.split(" ").reject { |token| has_url.call(token) }
  kept_tokens.map { |token| " " + token }.join
end

#segment_text(flag, text_array) ⇒ Object

Pre-processes the review text and sends it in for graph formation and further analysis.



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/automated_metareview/text_preprocessing.rb', line 72

# Cleans each text and cuts it into segments at the marks . ? ! , ;
#
# For every text, double quotes and parentheses are removed in place (the
# caller's strings are modified), URLs are removed through +remove_urls+ when
# the text contains "http://", and the text is then cut after each delimiter;
# every segment is stored stripped and keeps its closing delimiter. A text
# without any delimiter is stored whole.
#
# With flag == 0 all segments of all texts are appended to one list, which is
# returned. With flag == 1 each text gets its own list (the sentence counter
# restarts per text), but nothing is returned: for any non-zero flag the
# method returns nil.
#
# NOTE(review): text that follows the last delimiter of a multi-segment text
# is not stored, and 0 doubles as the "no delimiter found yet" value, so a
# delimiter at index 0 can be overridden by a later one. Both behaviours are
# kept here unchanged.
#
# @param flag [Integer] 0 for training, 1 for testing
# @param text_array [Array<String>] texts to segment; mutated in place
# @return [Array<String>, nil] the segment list when flag == 0, otherwise nil
def segment_text(flag, text_array)
  training = (flag == 0)
  # 50 is the number of different reviews/submissions pre-allocated for testing
  reviews = training ? Array.new(1) { [] } : Array.new(50) { [] }
  delimiters = [".", "?", "!", ",", ";"]

  sentence_no = 0
  review_no = 0

  text_array.each do |text|
    if flag == 1
      # every test review starts with a fresh sentence list and counter
      reviews[review_no] = []
      sentence_no = 0
    end

    # strip quotation marks and parentheses, then URLs
    text.gsub!("\"", "")
    text.gsub!("(", "")
    text.gsub!(")", "")
    text = remove_urls(text) if text.include?("http://")

    sentences = training ? reviews[0] : reviews[review_no]

    if delimiters.any? { |mark| text.include?(mark) }
      while delimiters.any? { |mark| text.include?(mark) }
        # Pick the cut position: the smallest delimiter index, where 0 also
        # means "nothing chosen yet" and is therefore replaced by any later hit.
        cut = 0
        delimiters.each do |mark|
          position = text.index(mark)
          next if position.nil?
          cut = position if cut == 0 || cut > position
        end

        sentences[sentence_no] = text[0..cut].strip
        sentence_no += 1
        text = text[(cut + 1)..text.length] # continue with the remainder
      end
    else
      # a single sentence without any delimiter
      sentences[sentence_no] = text.strip
      sentence_no += 1 if training
    end

    review_no += 1 if flag == 1
  end

  return reviews[0] if training
  nil
end