Module: RubricLLM
- Defined in:
- lib/rubric_llm.rb,
lib/rubric_llm/judge.rb,
lib/rubric_llm/rspec.rb,
lib/rubric_llm/config.rb,
lib/rubric_llm/errors.rb,
lib/rubric_llm/report.rb,
lib/rubric_llm/result.rb,
lib/rubric_llm/version.rb,
lib/rubric_llm/minitest.rb,
lib/rubric_llm/evaluator.rb,
lib/rubric_llm/comparison.rb,
lib/rubric_llm/metrics/base.rb,
lib/rubric_llm/retrieval_result.rb,
lib/rubric_llm/metrics/relevance.rb,
lib/rubric_llm/metrics/correctness.rb,
lib/rubric_llm/metrics/faithfulness.rb,
lib/rubric_llm/metrics/context_recall.rb,
lib/rubric_llm/metrics/factual_accuracy.rb,
lib/rubric_llm/metrics/context_precision.rb
Defined Under Namespace
Modules: Assertions, Metrics, RSpecMatchers
Classes: Comparison, Config, ConfigurationError, Error, Evaluator, Judge, JudgeError, Report, Result, RetrievalResult
Constant Summary
- VERSION = "0.1.1"
Class Method Summary
- .compare(report_a, report_b) ⇒ Object
  Compare two Reports with paired t-tests.
- .config ⇒ Object
- .configure {|new_config| ... } ⇒ Object
- .evaluate(question:, answer:, context: [], ground_truth: nil, metrics: nil, config: self.config, custom_prompt: nil) ⇒ Object
  Evaluate a single sample against all (or selected) metrics.
- .evaluate_batch(dataset, metrics: nil, config: self.config, custom_prompt: nil, concurrency: nil) ⇒ Object
  Evaluate a batch of samples and return a Report.
- .evaluate_retrieval(retrieved:, relevant:) ⇒ Object
  Evaluate retrieval quality without LLM calls.
- .reset_configuration! ⇒ Object
Class Method Details
.compare(report_a, report_b) ⇒ Object
Compare two Reports with paired t-tests.
comparison = RubricLLM.compare(report_a, report_b)
# File 'lib/rubric_llm.rb', line 77

def compare(report_a, report_b)
  Comparison.new(report_a, report_b)
end
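For example, two Reports produced by evaluate_batch over the same dataset can be compared; the custom_prompt value below is an illustrative assumption, not a documented default:

baseline   = RubricLLM.evaluate_batch(dataset)
candidate  = RubricLLM.evaluate_batch(dataset, custom_prompt: "Answer in one sentence.")  # illustrative prompt
comparison = RubricLLM.compare(baseline, candidate)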
.configure {|new_config| ... } ⇒ Object
# File 'lib/rubric_llm.rb', line 28

def configure
  new_config = Config.new(**config.to_h)
  yield(new_config)
  new_config.validate!
  @config = new_config
end
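A minimal usage sketch; the attribute names below (model, api_key) are assumptions for illustration only and are not documented on this page:

RubricLLM.configure do |config|
  config.model   = "gpt-4o"                # assumed Config attribute
  config.api_key = ENV["OPENAI_API_KEY"]   # assumed Config attribute
end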
.evaluate(question:, answer:, context: [], ground_truth: nil, metrics: nil, config: self.config, custom_prompt: nil) ⇒ Object
Evaluate a single sample against all (or selected) metrics.
result = RubricLLM.evaluate(
question: "What is the capital of France?",
answer: "Paris",
context: ["Paris is the capital of France."],
ground_truth: "Paris"
)
# File 'lib/rubric_llm.rb', line 47

def evaluate(question:, answer:, context: [], ground_truth: nil, metrics: nil,
             config: self.config, custom_prompt: nil)
  config = apply_custom_prompt(config, custom_prompt)
  evaluator = Evaluator.new(config:, metrics:)
  evaluator.call(question:, answer:, context:, ground_truth:)
end
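To score only a subset of metrics, a sketch assuming the metrics keyword accepts a list of metric identifiers (the symbol form is an assumption; the expected format is not documented on this page):

result = RubricLLM.evaluate(
  question: "What is the capital of France?",
  answer: "Paris",
  context: ["Paris is the capital of France."],
  metrics: [:faithfulness, :relevance]  # assumed identifier format
)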
.evaluate_batch(dataset, metrics: nil, config: self.config, custom_prompt: nil, concurrency: nil) ⇒ Object
# File 'lib/rubric_llm.rb', line 58

def evaluate_batch(dataset, metrics: nil, config: self.config, custom_prompt: nil, concurrency: nil)
  config = apply_custom_prompt(config, custom_prompt)
  pool_size = concurrency || config.concurrency
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  results =
    if pool_size > 1
      evaluate_batch_threaded(dataset, config:, metrics:, pool_size:)
    else
      evaluator = Evaluator.new(config:, metrics:)
      dataset.map { |sample| evaluate_sample(evaluator, sample) }
    end

  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
  Report.new(results:, duration:)
end
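A minimal sketch, assuming each sample in the dataset is a hash mirroring the evaluate keywords (the sample shape is an assumption, not confirmed on this page):

dataset = [
  {
    question: "What is the capital of France?",
    answer: "Paris",
    context: ["Paris is the capital of France."],
    ground_truth: "Paris"
  }  # assumed sample shape
]
report = RubricLLM.evaluate_batch(dataset, concurrency: 4)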
.evaluate_retrieval(retrieved:, relevant:) ⇒ Object
Evaluate retrieval quality without LLM calls.
result = RubricLLM.evaluate_retrieval(retrieved: [...], relevant: [...])
# File 'lib/rubric_llm.rb', line 84

def evaluate_retrieval(retrieved:, relevant:)
  RetrievalResult.new(retrieved:, relevant:)
end
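For instance, with document identifiers (the ids below are illustrative):

result = RubricLLM.evaluate_retrieval(
  retrieved: ["doc_1", "doc_2", "doc_3"],  # illustrative ids
  relevant:  ["doc_1", "doc_4"]
)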
.reset_configuration! ⇒ Object
# File 'lib/rubric_llm.rb', line 35

def reset_configuration!
  @config = nil
end
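Typically paired with a fresh configure call, for example in test setup; the attribute below is the same illustrative assumption as above:

RubricLLM.reset_configuration!
RubricLLM.configure { |config| config.model = "gpt-4o-mini" }  # assumed Config attribute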