Module: Licensee::ContentHelper
- Included in:
- License, ProjectFiles::LicenseFile
- Defined in:
- lib/licensee/content_helper.rb
Constant Summary collapse
- DIGEST =
Digest::SHA1
- START_REGEX =
/\A\s*/
- END_OF_TERMS_REGEX =
/^[\s#*_]*end of (the )?terms and conditions[\s#*_]*$/i
- REGEXES =
{ bom: /#{START_REGEX}\xEF\xBB\xBF/, hrs: /^\s*[=\-*]{3,}\s*$/, all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i, whitespace: /\s+/, markdown_headings: /^\s*#+/, version: /#{START_REGEX}version.*$/i, span_markup: /[_*~]+(.*?)[_*~]+/, link_markup: /\[(.+?)\]\(.+?\)/, block_markup: /^\s*>/, border_markup: /^[*-](.*?)[*-]$/, comment_markup: %r{^\s*?[/*]{1,2}}, url: %r{#{START_REGEX}https?://[^ ]+\n}, bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[).])\s+/i, developed_by: /#{START_REGEX}developed by:.*?\n\n/im, cc_dedication: /The\s+text\s+of\s+the\s+Creative\s+Commons.*?Public\s+Domain\s+Dedication./im, cc_wiki: /wiki.creativecommons.org/i, cc_legal_code: /^\s*Creative Commons Legal Code\s*$/i, cc0_info: /For more information, please see\s*\S+zero\S+/im, cc0_disclaimer: /CREATIVE COMMONS CORPORATION.*?\n\n/im, unlicense_info: /For more information, please.*\S+unlicense\S+/im, mit_optional: /\(including the next paragraph\)/i }.freeze
- NORMALIZATIONS =
{ lists: { from: /^\s*(?:\d\.|[*-])(?: [*_]{0,2}\(?[\da-z]\)[*_]{0,2})?\s+([^\n])/, to: '- \1' }, https: { from: /http:/, to: 'https:' }, ampersands: { from: '&', to: 'and' }, dashes: { from: /(?<!^)([—–-]+)(?!$)/, to: '-' }, quote: { from: /[`'"‘“’”]/, to: "'" }, hyphenated: { from: /(\w+)-\s*\n\s*(\w+)/, to: '\1-\2' } }.freeze
# Legally equivalent words that should be ignored for comparison.
# See spdx.org/spdx-license-list/matching-guidelines
#
# Each key is a spelling variant and each value is the spelling it is
# replaced with. 'wilful' maps to 'willful' (it previously mapped to the
# misspelling 'wilfull', which matched neither spelling).
VARIETAL_WORDS = {
  'acknowledgment' => 'acknowledgement',
  'analogue' => 'analog',
  'analyse' => 'analyze',
  'artefact' => 'artifact',
  'authorisation' => 'authorization',
  'authorised' => 'authorized',
  'calibre' => 'caliber',
  'cancelled' => 'canceled',
  'capitalisations' => 'capitalizations',
  'catalogue' => 'catalog',
  'categorise' => 'categorize',
  'centre' => 'center',
  'emphasised' => 'emphasized',
  'favour' => 'favor',
  'favourite' => 'favorite',
  'fulfil' => 'fulfill',
  'fulfilment' => 'fulfillment',
  'initialise' => 'initialize',
  'judgment' => 'judgement',
  'labelling' => 'labeling',
  'labour' => 'labor',
  'licence' => 'license',
  'maximise' => 'maximize',
  'modelled' => 'modeled',
  'modelling' => 'modeling',
  'offence' => 'offense',
  'optimise' => 'optimize',
  'organisation' => 'organization',
  'organise' => 'organize',
  'practise' => 'practice',
  'programme' => 'program',
  'realise' => 'realize',
  'recognise' => 'recognize',
  'signalling' => 'signaling',
  'sub-license' => 'sublicense',
  'sub license' => 'sublicense',
  'utilisation' => 'utilization',
  'whilst' => 'while',
  'wilful' => 'willful',
  'non-commercial' => 'noncommercial',
  'per cent' => 'percent',
  'copyright owner' => 'copyright holder'
}.freeze
- STRIP_METHODS =
%i[ bom cc_optional cc0_optional unlicense_optional borders title version url copyright title block_markup developed_by end_of_terms whitespace mit_optional ].freeze
Class Method Summary collapse
-
.const_missing(const) ⇒ Object
Makes constants backwards compatible to avoid a breaking change.
- .format_percent(float) ⇒ Object
- .title_regex ⇒ Object
-
.wrap(text, line_width = 80) ⇒ Object
Wrap text to the given line length.
Instance Method Summary collapse
-
#content_hash ⇒ Object
SHA1 of the normalized content.
- #content_normalized(wrap: nil) ⇒ Object
-
#content_without_title_and_version ⇒ Object
Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.
-
#length ⇒ Object
Number of characters in the normalized content.
-
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length.
-
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with size difference between licenses so that false positives for long licenses are ruled out by this score alone.
-
#wordset ⇒ Object
A set of each word in the license, without duplicates.
Class Method Details
.const_missing(const) ⇒ Object
Makes constants backwards compatible to avoid a breaking change
171 172 173 174 |
# File 'lib/licensee/content_helper.rb', line 171
# Backwards-compatibility hook for constants that are not defined.
#
# The requested constant name is downcased, every '_regex' occurrence is
# removed, and the remainder is looked up as a symbol key in REGEXES. When
# REGEXES has no truthy entry for that key, the call falls through to the
# inherited const_missing via `super`.
def self.const_missing(const)
  regex_name = const.to_s.downcase.gsub('_regex', '')
  REGEXES[regex_name.to_sym] || super
end
.format_percent(float) ⇒ Object
195 196 197 |
# File 'lib/licensee/content_helper.rb', line 195
# Renders a number with exactly two decimal places followed by a literal
# percent sign, e.g. 50.0 becomes "50.00%".
def self.format_percent(float)
  two_decimals = format('%<float>.2f', float: float)
  "#{two_decimals}%"
end
.title_regex ⇒ Object
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# File 'lib/licensee/content_helper.rb', line 199
# Memoized regex matching a license title at the very start of a text,
# optionally preceded by "(" and/or "the ", and extending to the end of
# that line.
def self.title_regex
  @title_regex ||= begin
    licenses = Licensee::License.all(hidden: true, psuedo: false)
    versioned_titles = licenses.map(&:title_regex)

    # Title regex must include the version to support matching within
    # families, but for sake of normalization, we can be less strict:
    # also accept the version-less name wherever it differs from the title.
    versionless = licenses.reject { |license| license.title == license.name_without_version }
    versionless_titles = versionless.map do |license|
      Regexp.new Regexp.escape(license.name_without_version), 'i'
    end

    titles = versioned_titles + versionless_titles
    /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
  end
end
.wrap(text, line_width = 80) ⇒ Object
Wrap text to the given line length
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/licensee/content_helper.rb', line 177
# Wrap text to the given line length.
#
# text       - the String to wrap; nil is passed through as nil.
# line_width - maximum line length (default: 80).
#
# Single newlines between non-empty lines are joined with a space first, so
# paragraphs are re-flowed rather than merely truncated. Lines matching
# REGEXES[:hrs], and lines already within line_width, are left untouched.
#
# Returns a new, stripped String; the argument is never mutated.
def self.wrap(text, line_width = 80)
  return if text.nil?

  # dup rather than clone: clone carries over the frozen flag, which made
  # the in-place gsub! calls below raise FrozenError for frozen input.
  text = text.dup

  # Surround each bullet marker with newlines so it is kept apart from the
  # paragraph joining performed by the next substitution.
  text.gsub!(REGEXES[:bullet]) { |m| "\n#{m}\n" }
  text.gsub!(/([^\n])\n([^\n])/, '\1 \2')

  wrapped_lines = text.split("\n").map do |line|
    if line.match?(REGEXES[:hrs]) || line.length <= line_width
      line
    else
      # Break at whitespace after at most line_width characters.
      line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
    end
  end

  wrapped_lines.join("\n").strip
end
Instance Method Details
#content_hash ⇒ Object
SHA1 of the normalized content
136 137 138 |
# File 'lib/licensee/content_helper.rb', line 136
# Hex digest (computed with DIGEST) of the normalized content.
# The value is memoized in @content_hash after the first call.
def content_hash
  @content_hash ||= begin
    normalized = content_normalized
    DIGEST.hexdigest(normalized)
  end
end
#content_normalized(wrap: nil) ⇒ Object
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/licensee/content_helper.rb', line 153
# Normalized content, memoized in @content_normalized.
#
# Starting from the downcased result of content_without_title_and_version,
# every NORMALIZATIONS key plus :spelling, :span_markup and :bullets is
# passed to `normalize`, then every STRIP_METHODS entry is passed to
# `strip`, in that order.
#
# wrap - when nil (the default) the normalized content is returned as is;
#        otherwise it is passed through Licensee::ContentHelper.wrap with
#        this value as the line width.
def content_normalized(wrap: nil)
  @content_normalized ||= begin
    @_content = content_without_title_and_version.downcase

    normalize_ops = NORMALIZATIONS.keys + %i[spelling span_markup bullets]
    normalize_ops.each { |op| normalize(op) }
    STRIP_METHODS.each { |op| strip(op) }

    _content
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end
#content_without_title_and_version ⇒ Object
Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile
144 145 146 147 148 149 150 151 |
# File 'lib/licensee/content_helper.rb', line 144
# Content with the title and version removed, memoized in
# @content_without_title_and_version.
#
# @_content is reset to nil first, then the html, hrs, comments,
# markdown_headings, link_markup, title and version operations are handed
# to `strip` in that order; whatever `_content` returns afterwards is the
# result. Case is left untouched here (downcasing happens in
# content_normalized).
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    @_content = nil
    %i[html hrs comments markdown_headings link_markup title version].each do |op|
      strip(op)
    end
    _content
  end
end
#length ⇒ Object
Number of characters in the normalized content
113 114 115 116 117 |
# File 'lib/licensee/content_helper.rb', line 113
# Number of characters in the normalized content, or 0 when there is no
# normalized content.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length
120 121 122 |
# File 'lib/licensee/content_helper.rb', line 120
# Absolute difference between this object's length and other.length.
#
# other - any object responding to `length` (e.g. another license or
#         project file).
def length_delta(other)
  difference = length - other.length
  difference.abs
end
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with size difference between licenses so that false positives for long licenses are ruled out by this score alone.
128 129 130 131 132 133 |
# File 'lib/licensee/content_helper.rb', line 128
# Similarity score between this object and another license or project file.
#
# The score is 200 * (words shared by wordset_fieldless and other.wordset)
# divided by the combined word count (minus the size of
# fields_normalized_set) plus a quarter of the variation-adjusted length
# delta, so a larger length difference lowers the score.
def similarity(other)
  own_words = wordset_fieldless
  other_words = other.wordset

  shared = (own_words & other_words).size
  combined = own_words.size + other_words.size - fields_normalized_set.size
  penalty = variation_adjusted_length_delta(other) / 4

  (shared * 200.0) / (combined + penalty)
end
#wordset ⇒ Object
A set of each word in the license, without duplicates
108 109 110 |
# File 'lib/licensee/content_helper.rb', line 108
# A set of each word in the normalized content, without duplicates.
#
# A word is a run of word characters, "/" or "-", where each character may
# be followed by a possessive "'s" (or a bare "'" directly after an "s").
# Yields nil when content_normalized is nil.
def wordset
  @wordset ||= begin
    normalized = content_normalized
    normalized.scan(%r{(?:[\w/-](?:'s|(?<=s)')?)+}).to_set unless normalized.nil?
  end
end