feat: add ability for ragas to read from a list
We want ragas to be able to read from both a JSONL results file (via a Path) and an in-memory list of samples.

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Dec 6, 2024
1 parent 5eb2310 commit df441c1
Showing 1 changed file with 23 additions and 8 deletions.
31 changes: 23 additions & 8 deletions src/instructlab/eval/ragas.py
@@ -1,11 +1,15 @@
 # Standard
+from pathlib import Path
 from typing import List, TypedDict
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+from ragas.metrics._domain_specific_rubrics import ( # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd
 
 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass
 
     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
 
         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
@@ -47,6 +51,19 @@ def run(
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
+        if not dataset:
+            raise ValueError(
+                "no dataset was provided, please specify the `dataset` argument"
+            )
+        if isinstance(dataset, Path):
+            input_ds = EvaluationDataset.from_pandas(
+                pd.read_json(dataset, lines=True, orient="records")
+            )
+        elif isinstance(dataset, list):
+            input_ds = EvaluationDataset.from_list(dataset)
+        else:
+            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -57,17 +74,15 @@ def run(
                 timeout=3600,
             )
 
-        # we will be using gpt-4o for the foreseeable future, we hardcode this
-        # for consistency of answers
-        input_ds = EvaluationDataset.from_list(dataset)
-
         # default set of metrics
         metrics = [
             RubricsScore(
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
             )
         ]
 
+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
             dataset=input_ds,

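For reference, a minimal usage sketch of the behavior added here (not part of the commit). It assumes the evaluator class in ragas.py is named RagasEvaluator, that Sample carries ragas-style keys such as user_input, response, and reference, and that an OpenAI API key is available for the gpt-4o judge model; the file name is illustrative.

# Hypothetical usage sketch; class name, sample keys, and file name are assumptions.
from pathlib import Path

from instructlab.eval.ragas import RagasEvaluator  # assumed class name

evaluator = RagasEvaluator()

# Option 1: pass an in-memory list of samples (existing behavior).
samples = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "reference": "Paris is the capital of France.",
    }
]
result = evaluator.run(dataset=samples)

# Option 2: pass a Path to a JSONL file, one sample record per line;
# the new code loads it via pd.read_json(..., lines=True) and converts it
# with EvaluationDataset.from_pandas().
result = evaluator.run(dataset=Path("samples.jsonl"))

print(result)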