Class: Gromit::MarkdownParser

Inherits:

Object

Object
Gromit::MarkdownParser

show all

Defined in: lib/gromit/markdown_parser.rb

Instance Attribute Summary collapse

#file_path ⇒ Object readonly

Returns the value of attribute file_path.
#sections ⇒ Object readonly

Returns the value of attribute sections.

Class Method Summary collapse

Instance Method Summary collapse

#initialize(file_path) ⇒ MarkdownParser constructor

A new instance of MarkdownParser.
#parse_file ⇒ Object

Constructor Details

#initialize(file_path) ⇒ `MarkdownParser`

Returns a new instance of MarkdownParser.

# File 'lib/gromit/markdown_parser.rb', line 116

# Creates a parser for a single Markdown file and parses it immediately.
#
# @param file_path [String] path of the file to read; stored as-is
def initialize(file_path)
  # Start with no sections; parse_file appends to @sections.
  @file_path, @sections = file_path, []
  parse_file
end

Instance Attribute Details

#file_path ⇒ `Object` (readonly)

Returns the value of attribute file_path.



4
5
6

# File 'lib/gromit/markdown_parser.rb', line 4

# Read-only accessor for the path given to the constructor.
#
# @return [Object] the value stored in @file_path
def file_path
  @file_path
end

#sections ⇒ `Object` (readonly)

Returns the value of attribute sections.



4
5
6

# File 'lib/gromit/markdown_parser.rb', line 4

# Read-only accessor for the sections collected while parsing the file.
#
# @return [Object] the value stored in @sections (an Array that parse_file
#   appends joined section strings to)
def sections
  @sections
end

Class Method Details

.get_embedding(section, section_id) ⇒ `Object`

# File 'lib/gromit/markdown_parser.rb', line 56

# Looks up the embedding for a section, asking OpenAI only on a cache miss.
#
# The cache entry is stored in Redis under +section_id+ as a JSON object with
# the keys 'token_count' and 'embedding'.
#
# @param section [String] the section text to embed
# @param section_id [String] cache key for this section
# @return [Array] a two-element array: [token_count, embedding]
def get_embedding(section, section_id)
  cached_json = redis.get(section_id)

  # Cache hit: decode and return without touching the API.
  unless cached_json.nil?
    entry = JSON.parse(cached_json)
    return [entry['token_count'], entry['embedding']]
  end

  # OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
  single_line = section.gsub(/\n/m, ' ')
  api_result = openai.embeddings(parameters: { input: single_line, model: "text-embedding-ada-002"})

  tokens = api_result['usage']['total_tokens']
  vector = api_result['data'].first['embedding']

  # Remember the result so the same section is never embedded twice.
  redis.set(section_id, {'token_count' => tokens, 'embedding' => vector}.to_json)

  [tokens, vector]
end

.git_file_list(dir) ⇒ `Object`



78
79
80

# File 'lib/gromit/markdown_parser.rb', line 78

# Lists the files git tracks inside +dir+.
#
# git is invoked directly with an argument vector (no shell), so directory
# names containing spaces or shell metacharacters are passed through
# verbatim. The `-z` flag makes git emit NUL-terminated, unquoted paths, so
# unusual file names come back exactly as they are on disk.
#
# @param dir [String, #to_s] directory inside a git working tree
# @return [Array<String>] tracked paths as printed by `git ls-files`; empty
#   when git prints nothing (for example when +dir+ is not a repository)
def git_file_list(dir)
  IO.popen(['git', '-C', dir.to_s, 'ls-files', '-z'], &:read).split("\0")
end

.openai ⇒ `Object`



12
13
14

# File 'lib/gromit/markdown_parser.rb', line 12

# Memoized OpenAI client shared through the @@openai class variable.
#
# The access token is read from the OPENAPI_ACCESS_TOKEN environment
# variable; ENV.fetch raises KeyError when it is not set.
#
# @return [OpenAI::Client]
def openai
  @@openai ||= begin
    access_token = ENV.fetch("OPENAPI_ACCESS_TOKEN")
    OpenAI::Client.new(access_token: access_token)
  end
end

.process(directory) ⇒ `Object`

# File 'lib/gromit/markdown_parser.rb', line 16

# Builds one record per Markdown section for every git-tracked `.md` file
# under +directory+.
#
# For each file the page title is taken from its first line (a leading
# "# " is removed). Each section returned by Gromit::MarkdownParser is run
# through process_markdown and paired with its embedding from get_embedding.
#
# @param directory [String] directory passed to git_file_list
# @return [Array<Hash>] flat list of hashes with the keys :id, :page_id,
#   :section_id, :file, :title, :section_title, :content, :checksum,
#   :token_count and :embedding
def process(directory)
  markdown_files = git_file_list(directory).select { |file| file.end_with?('.md') }

  markdown_files.flat_map do |file|
    file_path = File.expand_path(File.join(directory, file))

    puts "processing file: #{file_path}"

    # Read once; the same contents feed both the title and the checksum.
    contents = File.read(file_path)

    page_id = Digest::SHA1.hexdigest(file_path)
    # An empty file has no first line, so fall back to an empty title.
    title = contents.lines.first.to_s.strip.gsub(/^#\s+/, '')
    checksum = Digest::MD5.hexdigest(contents)

    Gromit::MarkdownParser.new(file_path).sections.map do |section|
      section = process_markdown(section)
      # Strip the leading header markers ("#", "##", ...) from the first line.
      section_title = section.lines.first.strip.gsub(/^#+\s+/, '')
      section_id = Digest::MD5.hexdigest(section)
      id = Digest::MD5.hexdigest("#{page_id}:#{section_id}")

      # get cached embedding or call out to openai
      token_count, embedding = get_embedding(section, section_id)

      {
        id: id,
        page_id: page_id,
        section_id: section_id,
        file: file,
        title: title,
        section_title: section_title,
        content: section,
        checksum: checksum,
        token_count: token_count,
        embedding: embedding,
      }
    end
  end
end

.process_markdown(file) ⇒ `Object`

# File 'lib/gromit/markdown_parser.rb', line 82

# Rewrites GitBook-flavoured Markdown into the form used by the mkdocs site.
#
# Four rewrites are applied in order: GitBook asset image URLs are pointed
# at the mkdocs host, the `.md` extension is dropped from link targets,
# "mention" links get a titleized label, and `{% hint %}` blocks become
# `!!!` admonitions with an indented body.
#
# The argument is not modified; the rewrites run on a copy, so frozen
# strings are accepted as well.
#
# @param file [String] Markdown text (despite the name, not a path)
# @return [String] the rewritten Markdown
def process_markdown(file)
  mkdocs_url = "https://docs-mkdocs.releaseapp.io"
  # dup yields an unfrozen copy, leaving the caller's string untouched.
  text = file.dup

  # handle images
  # TODO: deal with spaces??
  text.gsub!(/(!\[[^\]]*?\])([\(<]+)[\.\/]+\.gitbook\/assets\/(.*?)([\)>]+)/m) do
    m = Regexp.last_match
    "#{m[1]}#{m[2]}#{mkdocs_url}/img/#{m[3]}#{m[4]}"
  end

  # remove the .md extension from the end of the URLs from gitbook
  text.gsub!(/(\[[^\]]+?\])\((.*?)\.md([#a-z0-9]*)\)/) do
    m = Regexp.last_match
    "#{m[1]}(#{m[2]}#{m[3]})"
  end

  # handle "mentions"
  text.gsub!(/\[([^\]]+?).md\]\((.*?)\.md([#a-z0-9]*) "mention"\)/) do
    m = Regexp.last_match
    link = "#{m[2]}#{m[3]}"
    title = ActiveSupport::Inflector.titleize(m[1].tr('-', ' '))
    "[#{title}](#{link})"
  end

  # convert gitbook hints to admonitions
  # multi-line shortest match ...
  text.gsub!(/{%\s+hint style="(.*?)"\s+?%}(.*?){% endhint %}/m) do
    m = Regexp.last_match
    indented_body = m[2].lines.map { |line| "    #{line}" }.join
    "!!! #{m[1]}\n#{indented_body}"
  end

  text
end

.redis ⇒ `Object`



8
9
10

# File 'lib/gromit/markdown_parser.rb', line 8

# Memoized Redis connection shared through the @@redis class variable.
#
# Host and port come from the REDIS_HOST and REDIS_PORT environment
# variables, defaulting to 127.0.0.1 and 6379 when unset.
#
# @return [Redis]
def redis
  @@redis ||= begin
    host = ENV.fetch("REDIS_HOST", "127.0.0.1")
    port = ENV.fetch("REDIS_PORT", "6379").to_i
    Redis.new(host: host, port: port)
  end
end

Instance Method Details

#parse_file ⇒ `Object`

# File 'lib/gromit/markdown_parser.rb', line 122

# Splits the file at +file_path+ into sections and appends them to @sections.
#
# A new section begins at every line for which header? is true; any lines
# before the first header form a section of their own. Each section is
# stored as a single string containing its lines unchanged.
#
# The file is streamed line by line with File.foreach rather than passing a
# block to String#lines, which is deprecated.
#
# @return [Array, nil] @sections when a trailing section was appended,
#   otherwise nil (an empty file)
def parse_file
  pending = []

  File.foreach(file_path) do |line|
    if header?(line)
      # Flush what was collected so far, then open a section at this header.
      @sections << pending.join unless pending.empty?
      pending = [line]
    else
      pending << line
    end
  end

  # The final section has no following header to trigger a flush.
  @sections << pending.join unless pending.empty?
end

Class: Gromit::MarkdownParser

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path) ⇒ MarkdownParser

Instance Attribute Details

#file_path ⇒ Object (readonly)

#sections ⇒ Object (readonly)

Class Method Details

.get_embedding(section, section_id) ⇒ Object

.git_file_list(dir) ⇒ Object

.openai ⇒ Object

.process(directory) ⇒ Object

.process_markdown(file) ⇒ Object

.redis ⇒ Object

Instance Method Details

#parse_file ⇒ Object

#initialize(file_path) ⇒ `MarkdownParser`

#file_path ⇒ `Object` (readonly)

#sections ⇒ `Object` (readonly)

.get_embedding(section, section_id) ⇒ `Object`

.git_file_list(dir) ⇒ `Object`

.openai ⇒ `Object`

.process(directory) ⇒ `Object`

.process_markdown(file) ⇒ `Object`

.redis ⇒ `Object`

#parse_file ⇒ `Object`