Module: Licensee::ContentHelper

Included in:: License, ProjectFiles::LicenseFile

Defined in:: lib/licensee/content_helper.rb

Constant Summary collapse

# Digest class used by #content_hash to fingerprint the normalized content.
DIGEST =

Digest::SHA1

# Anchors a pattern to the very start of the string, allowing leading
# whitespace. Interpolated into several REGEXES entries and into .title_regex.
START_REGEX =

/\A\s*/

# Matches (case-insensitively) a line reading "end of terms and conditions" or
# "end of the terms and conditions", optionally surrounded by whitespace and
# `#`, `*` or `_` characters.
END_OF_TERMS_REGEX =

/^[\s#*_]*end of (the )?terms and conditions[\s#*_]*$/i

# Named patterns used by this helper (e.g. REGEXES[:bullet] and REGEXES[:hrs]
# in .wrap). .const_missing also resolves legacy `<KEY>_REGEX` constant names
# against the keys of this hash, so renaming a key changes that lookup.
REGEXES =

{
  # UTF-8 byte order mark (EF BB BF) at the start of the content
  bom:                 /#{START_REGEX}\xEF\xBB\xBF/,
  # A line consisting of three or more `=`, `-` or `*` characters
  hrs:                 /^\s*[=\-*]{3,}\s*$/,
  # Leading "all rights reserved" line, with an optional trailing period
  all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
  # Any run of whitespace
  whitespace:          /\s+/,
  # One or more `#` characters at the start of a line
  markdown_headings:   /^\s*#+/,
  # Leading line that begins with "version"
  version:             /#{START_REGEX}version.*$/i,
  # Text wrapped in runs of `_`, `*` or `~`; the inner text is captured
  span_markup:         /[_*~]+(.*?)[_*~]+/,
  # `[text](target)` links; the link text is captured
  link_markup:         /\[(.+?)\]\(.+?\)/,
  # `>` at the start of a line
  block_markup:        /^\s*>/,
  # A line that both starts and ends with `*` or `-`; the inside is captured
  border_markup:       /^[*-](.*?)[*-]$/,
  # One or two `/` or `*` characters at the start of a line
  comment_markup:      %r{^\s*?[/*]{1,2}},
  # Leading http(s) URL terminated by a newline
  url:                 %r{#{START_REGEX}https?://[^ ]+\n},
  # A list marker following a blank line: `*`, `-`, or one or two
  # alphanumerics closed by `)` or `.` (optionally opened by `(`)
  bullet:              /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[).])\s+/i,
  # Leading "developed by:" text up to the first blank line
  developed_by:        /#{START_REGEX}developed by:.*?\n\n/im,
  # NOTE(review): the final `.` is unescaped and matches any character
  cc_dedication:       /The\s+text\s+of\s+the\s+Creative\s+Commons.*?Public\s+Domain\s+Dedication./im,
  # NOTE(review): the dots are unescaped and match any character
  cc_wiki:             /wiki.creativecommons.org/i,
  # A line reading "Creative Commons Legal Code"
  cc_legal_code:       /^\s*Creative Commons Legal Code\s*$/i,
  # "For more information, please see" followed by a token containing "zero"
  cc0_info:            /For more information, please see\s*\S+zero\S+/im,
  # Text from "CREATIVE COMMONS CORPORATION" up to the next blank line
  cc0_disclaimer:      /CREATIVE COMMONS CORPORATION.*?\n\n/im,
  # "For more information, please" ... a token containing "unlicense"
  unlicense_info:      /For more information, please.*\S+unlicense\S+/im,
  # The literal phrase "(including the next paragraph)"
  mit_optional:        /\(including the next paragraph\)/i
}.freeze

# Substitution rules, each a `{ from:, to: }` pair. #content_normalized passes
# every key of this hash to `normalize`, in insertion order; presumably
# `normalize` replaces matches of `from` with `to` (defined outside this view).
NORMALIZATIONS =

{
  # Numbered or `*`/`-` list markers at the start of a line become "- "
  lists:      { from: /^\s*(?:\d\.|[*-])(?: [*_]{0,2}\(?[\da-z]\)[*_]{0,2})?\s+([^\n])/, to: '- \1' },
  # "http:" becomes "https:"
  https:      { from: /http:/, to: 'https:' },
  # "&" becomes "and"
  ampersands: { from: '&', to: 'and' },
  # Runs of em dashes, en dashes or hyphens that are neither at the start nor
  # at the end of a line become a single "-"
  dashes:     { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
  # Backticks and straight or curly quotes become "'"
  quote:      { from: /[`'"‘“’”]/, to: "'" },
  # A word hyphenated across a line break is re-joined as "word-word"
  hyphenated: { from: /(\w+)-\s*\n\s*(\w+)/, to: '\1-\2' }
}.freeze

VARIETAL_WORDS = Legally equivalent words that should be ignored for comparison. See spdx.org/spdx-license-list/matching-guidelines

# Maps each spelling variant (key) to the single spelling (value) that both
# forms should share, so that legally equivalent spellings compare equal.
# Keys may contain a hyphen or a space ('sub-license', 'per cent').
{
  'acknowledgment'  => 'acknowledgement',
  'analogue'        => 'analog',
  'analyse'         => 'analyze',
  'artefact'        => 'artifact',
  'authorisation'   => 'authorization',
  'authorised'      => 'authorized',
  'calibre'         => 'caliber',
  'cancelled'       => 'canceled',
  'capitalisations' => 'capitalizations',
  'catalogue'       => 'catalog',
  'categorise'      => 'categorize',
  'centre'          => 'center',
  'emphasised'      => 'emphasized',
  'favour'          => 'favor',
  'favourite'       => 'favorite',
  'fulfil'          => 'fulfill',
  'fulfilment'      => 'fulfillment',
  'initialise'      => 'initialize',
  'judgment'        => 'judgement',
  'labelling'       => 'labeling',
  'labour'          => 'labor',
  'licence'         => 'license',
  'maximise'        => 'maximize',
  'modelled'        => 'modeled',
  'modelling'       => 'modeling',
  'offence'         => 'offense',
  'optimise'        => 'optimize',
  'organisation'    => 'organization',
  'organise'        => 'organize',
  'practise'        => 'practice',
  'programme'       => 'program',
  'realise'         => 'realize',
  'recognise'       => 'recognize',
  'signalling'      => 'signaling',
  'sub-license'     => 'sublicense',
  'sub license'     => 'sublicense',
  'utilisation'     => 'utilization',
  'whilst'          => 'while',
  # The target must be the real alternate spelling "willful"; mapping to the
  # non-word "wilfull" left "wilful" and "willful" texts unequal.
  'wilful'          => 'willful',
  'non-commercial'  => 'noncommercial',
  'per cent'        => 'percent',
  'copyright owner' => 'copyright holder'
}.freeze

# Operation names that #content_normalized passes to `strip`, one at a time
# and in exactly this order, after the normalizations have run.
# NOTE(review): `title` is listed twice (before `version` and again after
# `copyright`); this looks deliberate, but confirm before de-duplicating.
STRIP_METHODS =

%i[
  bom
  cc_optional
  cc0_optional
  unlicense_optional
  borders
  title
  version
  url
  copyright
  title
  block_markup
  developed_by
  end_of_terms
  whitespace
  mit_optional
].freeze

Class Method Summary collapse

.const_missing(const) ⇒ Object

Keeps the legacy `*_REGEX` constants resolvable, for backwards compatibility, to avoid a breaking change.
.format_percent(float) ⇒ Object
.title_regex ⇒ Object
.wrap(text, line_width = 80) ⇒ Object

Wrap text to the given line length.

Instance Method Summary collapse

#content_hash ⇒ Object

SHA1 of the normalized content.
#content_normalized(wrap: nil) ⇒ Object
#content_without_title_and_version ⇒ Object

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with the attribution first to detect attribution in LicenseFile.
#length ⇒ Object

Number of characters in the normalized content.
#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length.
#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with the size difference between licenses, so that false positives for long licenses are ruled out by this score alone.
#wordset ⇒ Object

A set of each word in the license, without duplicates.

Class Method Details

.const_missing(const) ⇒ `Object`

Keeps the legacy `*_REGEX` constants resolvable, for backwards compatibility, to avoid a breaking change.

# File 'lib/licensee/content_helper.rb', line 171

# Resolves legacy constant names against REGEXES so old references keep
# working. The requested name is downcased, every "_regex" substring is
# removed, and the result is used as a REGEXES key.
#
# @param const [Symbol, String] name of the constant that was not found
# @return [Object] the matching REGEXES entry, otherwise whatever `super`
#   does for an unknown constant
def self.const_missing(const)
  legacy_key = const.to_s.downcase.gsub('_regex', '').to_sym
  pattern = REGEXES[legacy_key]
  return pattern if pattern

  super
end

.format_percent(float) ⇒ `Object`



195
196
197

# File 'lib/licensee/content_helper.rb', line 195

# Renders a number with two decimal places followed by a percent sign.
#
# @param float [Numeric] value to render
# @return [String] e.g. "98.50%"
def self.format_percent(float)
  # `%%` emits a literal percent sign.
  format('%<float>.2f%%', float: float)
end

.title_regex ⇒ `Object`

# File 'lib/licensee/content_helper.rb', line 199

# Builds (once) and returns a case-insensitive pattern that matches a leading
# license title line: optional "(" and "the ", then any known title, then the
# rest of that line.
#
# @return [Regexp] memoized in @title_regex
def self.title_regex
  return @title_regex if @title_regex

  licenses = Licensee::License.all(hidden: true, psuedo: false)
  candidates = licenses.map(&:title_regex)

  # Each license's own title regex must include the version to support
  # matching within families; for normalization we can be less strict, so
  # also accept the version-less name wherever it differs from the title.
  versioned = licenses.reject { |license| license.title == license.name_without_version }
  versioned.each do |license|
    candidates << Regexp.new(Regexp.escape(license.name_without_version), 'i')
  end

  @title_regex = /#{START_REGEX}\(?(?:the )?#{Regexp.union candidates}.*?$/i
end

.wrap(text, line_width = 80) ⇒ `Object`

Wrap text to the given line length

# File 'lib/licensee/content_helper.rb', line 177

# Wrap text to the given line length.
#
# Single newlines inside a paragraph are replaced by spaces, blank-line
# separated paragraphs are kept apart, and every resulting line longer than
# `line_width` is re-broken at whitespace. Lines matching REGEXES[:hrs] are
# left as they are. The argument itself is never modified.
#
# @param text [String, nil] text to wrap; may be frozen
# @param line_width [Integer] maximum length of a line
# @return [String, nil] the wrapped, stripped text, or nil when text is nil
def self.wrap(text, line_width = 80)
  return if text.nil?

  # `dup` (unlike `clone`) does not carry over the frozen state, so the
  # in-place substitutions below also work when a frozen string is passed.
  text = text.dup
  # Put bullets on their own line so they are not merged into a paragraph.
  text.gsub!(REGEXES[:bullet]) { |m| "\n#{m}\n" }
  # Join hard-wrapped lines: a lone newline between two non-newline chars.
  text.gsub!(/([^\n])\n([^\n])/, '\1 \2')

  # Up to `line_width` characters followed by whitespace or end of line.
  line_break = /(.{1,#{line_width}})(\s+|$)/

  wrapped = text.split("\n").map do |line|
    if line =~ REGEXES[:hrs] || line.length <= line_width
      line
    else
      line.gsub(line_break, "\\1\n").strip
    end
  end

  wrapped.join("\n").strip
end

Instance Method Details

#content_hash ⇒ `Object`

SHA1 of the normalized content



136
137
138

# File 'lib/licensee/content_helper.rb', line 136

# Hex digest (computed with DIGEST) of the normalized content.
#
# @return [String] memoized in @content_hash
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end

#content_normalized(wrap: nil) ⇒ `Object`

# File 'lib/licensee/content_helper.rb', line 153

# Normalized form of the content, memoized in @content_normalized.
#
# On first use the title/version-stripped content is downcased and stored in
# @_content, then every NORMALIZATIONS key plus :spelling, :span_markup and
# :bullets is passed to `normalize`, then every STRIP_METHODS entry is passed
# to `strip`; the value of `_content` afterwards is the result.
#
# @param wrap [Integer, nil] when given, the result is wrapped to this width
#   via Licensee::ContentHelper.wrap (the memoized value stays unwrapped)
# @return [String, nil]
def content_normalized(wrap: nil)
  @content_normalized ||= begin
    @_content = content_without_title_and_version.downcase

    normalize_steps = NORMALIZATIONS.keys + %i[spelling span_markup bullets]
    normalize_steps.each { |step| normalize(step) }
    STRIP_METHODS.each { |step| strip(step) }

    _content
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end

#content_without_title_and_version ⇒ `Object`

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with the attribution first to detect attribution in LicenseFile.

# File 'lib/licensee/content_helper.rb', line 144

# Case-preserving content after the :html, :hrs, :comments,
# :markdown_headings, :link_markup, :title and :version strip operations,
# applied in that order. Memoized in @content_without_title_and_version.
#
# @return [String, nil] the value of `_content` after stripping
def content_without_title_and_version
  return @content_without_title_and_version if @content_without_title_and_version

  # Reset the working copy before stripping.
  @_content = nil
  %i[html hrs comments markdown_headings link_markup title version].each do |step|
    strip(step)
  end

  @content_without_title_and_version = _content
end

#length ⇒ `Object`

Number of characters in the normalized content

# File 'lib/licensee/content_helper.rb', line 113

# Number of characters in the normalized content.
#
# @return [Integer] 0 when there is no normalized content
def length
  content_normalized ? content_normalized.length : 0
end

#length_delta(other) ⇒ `Object`

Given another license or project file, calculates the difference in length



120
121
122

# File 'lib/licensee/content_helper.rb', line 120

# Absolute difference between this object's #length and `other.length`.
#
# @param other [#length] another license or project file
# @return [Integer]
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#similarity(other) ⇒ `Object`

Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with the size difference between licenses, so that false positives for long licenses are ruled out by this score alone.

# File 'lib/licensee/content_helper.rb', line 128

# Similarity score against another license or project file.
#
# Computed as (shared words * 200.0) divided by the combined word count
# (minus the size of `fields_normalized_set`) plus a quarter of
# `variation_adjusted_length_delta(other)`. `wordset_fieldless`,
# `fields_normalized_set` and `variation_adjusted_length_delta` are defined
# outside this view.
#
# @param other [#wordset] another license or project file
# @return [Float]
def similarity(other)
  shared_words = (wordset_fieldless & other.wordset).size
  combined_size = wordset_fieldless.size + other.wordset.size - fields_normalized_set.size
  length_penalty = variation_adjusted_length_delta(other) / 4

  (shared_words * 200.0) / (combined_size + length_penalty)
end

#wordset ⇒ `Object`

A set of each word in the license, without duplicates



108
109
110

# File 'lib/licensee/content_helper.rb', line 108

# The distinct words of the normalized content, memoized in @wordset.
#
# A word is a run of word characters, `/` and `-`, where a character may be
# followed by `'s`, or by `'` directly after an "s".
#
# @return [Set<String>, nil] nil when there is no normalized content
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  @wordset = normalized.nil? ? nil : normalized.scan(%r{(?:[\w/-](?:'s|(?<=s)')?)+}).to_set
end

Module: Licensee::ContentHelper

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.const_missing(const) ⇒ Object

.format_percent(float) ⇒ Object

.title_regex ⇒ Object

.wrap(text, line_width = 80) ⇒ Object

Instance Method Details

#content_hash ⇒ Object

#content_normalized(wrap: nil) ⇒ Object

#content_without_title_and_version ⇒ Object

#length ⇒ Object

#length_delta(other) ⇒ Object

#similarity(other) ⇒ Object

#wordset ⇒ Object