diff --git a/requirements.txt b/requirements.txt
index a3e6e7d..0853899 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+ragas
diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
new file mode 100644
index 0000000..f0445da
--- /dev/null
+++ b/src/instructlab/eval/ragas.py
@@ -0,0 +1,264 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional, TypedDict
+
+# Third Party
+from langchain_community.chat_models import ChatOpenAI
+from openai import Client as OpenAIClient
+from openai.types.chat import ChatCompletionMessageParam
+from pandas import DataFrame, read_json
+from pydantic import BaseModel, ConfigDict, Field
+from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
+from ragas.metrics import Metric
+from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+
+# Local
+from .evaluator import Evaluator
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+class Sample(TypedDict):
+    """
+    TypedDict of a sample that we accept when doing eval with Ragas.
+    We specifically use TypedDict here to be flexible with the input data we accept.
+    """
+
+    # question
+    user_input: str
+
+    # model answer
+    response: Optional[str]
+
+    # golden answer
+    reference: str
+
+
+# Default system prompt used when none is provided. Kept private because we don't
+# intend this to be a public object.
+_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
+Your primary goal is to answer queries with the most up-to-date and factual information available.
+Focus on delivering clear, concise, and correct responses.
+If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
+Your responses should prioritize accuracy over all other considerations."""
+
+DEFAULT_SEED = 1337
+DEFAULT_JUDGE_MODEL = "gpt-4o"
+
+
+class ModelConfig(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    # name of the model to use.
+    model_name: str
+
+    # The system prompt to be used when applying the chat template.
+    system_prompt: str = _DEFAULT_SYSTEM_PROMPT
+
+    # "model randomness", i.e. the likelihood of sampling something other than the likeliest token
+    temperature: float = Field(default=0.0, le=1.0, ge=0.0)
+
+    # Max number of tokens to generate.
+    max_tokens: int = 768
+
+    # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
+    seed: int = DEFAULT_SEED
+
+
+class RagasEvaluator(Evaluator):
+    # most basic implementation; we just assume that the user will bring the existing model responses
+    name = "ragas"
+
+    def __init__(
+        self,
+        student_model: ModelConfig | None = None,
+        run_config: RunConfig | None = None,
+        student_openai_client: OpenAIClient | None = None,
+        judge_model_name: str = DEFAULT_JUDGE_MODEL,
+        judge_openai_api_key: str | None = None,
+    ):
+        self.student_model = student_model
+        self.run_config = run_config
+        self.student_openai_client = student_openai_client
+        self.judge_model_name = judge_model_name
+        self.judge_openai_api_key = judge_openai_api_key
+
+    @staticmethod
+    def _validate_dataset(df: DataFrame):
+        """
+        Validates that the given `df` is a valid dataset of `Sample` objects.
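+        The minimum required columns are `user_input` and `reference`; the `response`
+        column is optional and may instead be generated from a student model.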
+
+        Args:
+            df (DataFrame): DataFrame containing the dataset to be evaluated.
+        """
+        # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
+        # is only included by default in Python 3.11+. For earlier versions, the `typing_extensions` package is required.
+        # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
+        required_keys = {"user_input", "reference"}
+        missing_keys = required_keys - set(df.columns)
+        if missing_keys:
+            raise ValueError(
+                f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
+            )
+
+    def run(
+        self,
+        dataset: List[Sample] | Path,
+        student_model: ModelConfig | None = None,
+        run_config: RunConfig | None = None,
+        student_openai_client: OpenAIClient | None = None,
+        judge_model_name: str | None = None,
+        judge_openai_api_key: str | None = None,
+    ) -> EvaluationResult:
+        """
+        Evaluates the quality of model responses against a graded rubric.
+
+        When the `dataset` lacks the `response` field, `student_model` must be provided
+        in order to generate the answers.
+
+        Args:
+            dataset (List[Sample] | Path):
+                Can be either a list of `Sample` objects or a path to a jsonl file containing
+                records matching `Sample`.
+            student_model (ModelConfig | None, optional):
+                When this parameter is provided, we'll attempt to use the described model to
+                generate the responses for the given list of questions.
+            run_config (RunConfig | None, optional):
+                Configuration to use when running evaluations. If none is provided,
+                a default one is created with extremely permissive timeout and retry settings.
+                This is because, by default, OpenAI tier-1 usage accounts have very low
+                rate limits, resulting in heavy throttling during evaluations.
+            student_openai_client (openai.Client | None, optional):
+                The client to use when generating responses from the student model; must be compatible with the OpenAI API.
+                This field is required when `student_model` is provided.
+            judge_model_name (str | None, optional):
+                Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified.
+            judge_openai_api_key (str | None, optional):
+                The API key to use for evaluating the given dataset. When this isn't provided, `OPENAI_API_KEY` is read instead.
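+
+        Example (an illustrative sketch only; the file path, model name, base URL,
+        and API key below are placeholders):
+
+            evaluator = RagasEvaluator()
+            result = evaluator.run(
+                dataset=Path("questions.jsonl"),
+                student_model=ModelConfig(model_name="my-student-model"),
+                student_openai_client=OpenAIClient(
+                    base_url="http://localhost:8000/v1", api_key="no-key-needed"
+                ),
+            )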
+
+        Returns:
+            EvaluationResult: The results of all evaluations performed by Ragas
+        """
+        judge_model_name = (
+            judge_model_name if judge_model_name else self.judge_model_name
+        )
+        judge_openai_api_key = (
+            judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
+        )
+        student_model = student_model if student_model else self.student_model
+        run_config = run_config if run_config else self.run_config
+        student_openai_client = (
+            student_openai_client
+            if student_openai_client
+            else self.student_openai_client
+        )
+
+        # ensure the dataset is in DataFrame format
+        input_df = None
+        if isinstance(dataset, list):
+            input_df = DataFrame(dataset)
+        elif isinstance(dataset, Path):
+            input_df = read_json(dataset, orient="records", lines=True)
+        else:
+            raise TypeError(f"invalid type of dataset: {type(dataset)}")
+
+        # this should never happen, but pylint is not smart enough to detect it
+        if TYPE_CHECKING:
+            assert input_df is not None
+
+        # ensure the dataset is in the format we expect
+        self._validate_dataset(input_df)
+
+        need_to_generate_responses = "response" not in input_df.columns
+        if need_to_generate_responses:
+            logger.debug(
+                "`response` is missing in the input dataframe columns, generating responses from the model is required."
+            )
+            if not student_model or not student_openai_client:
+                raise ValueError(
+                    "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
+                )
+
+        # if the student model was provided then we always generate regardless
+        if student_model:
+            if not student_openai_client:
+                raise ValueError(
+                    "`student_model` was specified but `student_openai_client` was not provided"
+                )
+            input_df = self._generate_answers_from_model(
+                input_df, student_model, student_openai_client
+            )
+
+        if not run_config:
+            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
+            # are horrible and will result in half of our evaluation results being NaN or 0
+            run_config = RunConfig(
+                max_retries=120,
+                max_wait=7200,
+                seed=DEFAULT_SEED,
+                timeout=3600,
+            )
+
+        metrics = self._get_metrics()
+        evaluation_ds = EvaluationDataset.from_pandas(input_df)
+
+        # the judge model defaults to gpt-4o for consistency of answers, but it can be
+        # overridden via `judge_model_name`
+        critic_lm = ChatOpenAI(model=judge_model_name, api_key=judge_openai_api_key)
+        results = evaluate(
+            dataset=evaluation_ds,
+            batch_size=4,
+            run_config=run_config,
+            llm=critic_lm,
+            metrics=metrics,
+            show_progress=True,
+        )
+        return results
+
+    def _generate_answers_from_model(
+        self,
+        questions: DataFrame,
+        student_model: ModelConfig,
+        student_openai_client: OpenAIClient,
+    ) -> DataFrame:
+        """
+        Given a DataFrame containing a `user_input` column, generates responses from the given model
+        and returns a new DataFrame containing its answers in the `response` column.
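+        Each question is sent to the model individually (one request per row of the
+        DataFrame), using the system prompt and sampling parameters from `student_model`.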
+        """
+        # initialize the response column to write into
+        updated_df = questions.copy()
+        updated_df["response"] = ""
+
+        for i, qna in updated_df.iterrows():
+            messages: List[ChatCompletionMessageParam] = [
+                {
+                    "role": "system",
+                    "content": student_model.system_prompt,
+                },
+                {"role": "user", "content": qna["user_input"]},
+            ]
+            response = student_openai_client.chat.completions.create(
+                messages=messages,
+                model=student_model.model_name,
+                # specify the seed so we can at least try to have some reproducibility when the clients support it
+                seed=student_model.seed,
+                max_tokens=student_model.max_tokens,
+                temperature=student_model.temperature,
+            )
+            updated_df.at[i, "response"] = response.choices[0].message.content
+        return updated_df
+
+    @staticmethod
+    def _get_metrics() -> List[Metric]:
+        # default set of metrics
+        return [
+            RubricsScore(
+                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
+            )
+        ]
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
new file mode 100644
index 0000000..1d3bb8f
--- /dev/null
+++ b/tests/test_ragas.py
@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+import unittest
+
+# Third Party
+from pandas import DataFrame
+from ragas.callbacks import ChainRun
+from ragas.dataset_schema import EvaluationDataset, EvaluationResult
+
+# First Party
+from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig
+
+
+class TestRagasEvaluator(unittest.TestCase):
+    def setUp(self):
+        # Common setup data for all tests
+        self.student_model_response = "Paris"
+        self.user_question = "What is the capital of France?"
+        self.golden_answer = "The capital of France is Paris."
+        self.metric = "mocked-metric"
+        self.metric_score = 4.0
+        self.base_ds = [
+            {
+                "user_input": self.user_question,
+                "reference": self.golden_answer,
+            }
+        ]
+        self.student_model = ModelConfig(
+            model_name="super-jeeves-8x700B",
+        )
+        self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30)
+
+    @patch("instructlab.eval.ragas.ChatOpenAI")
+    @patch("instructlab.eval.ragas.evaluate")
+    @patch.object(RagasEvaluator, "_generate_answers_from_model")
+    @patch.object(RagasEvaluator, "_get_metrics")
+    def test_run_with_dataset(
+        self,
+        mock_get_metrics: MagicMock,
+        mock_generate_answers_from_model: MagicMock,
+        mock_evaluate: MagicMock,
+        mock_ChatOpenAI: MagicMock,
+    ):
+        """
+        Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`.
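+        All external interactions (`ChatOpenAI`, `evaluate`, answer generation, and
+        metric resolution) are mocked, so no network access is required.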
+        """
+        # Prepare mocks
+        mock_get_metrics.return_value = [self.metric]
+        interim_df = DataFrame(
+            {
+                "user_input": [self.user_question],
+                "response": [self.student_model_response],
+                "reference": [self.golden_answer],
+            }
+        )
+        mock_generate_answers_from_model.return_value = interim_df
+        mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
+        _unimportant_ragas_traces = {
+            "default": ChainRun(
+                run_id="42",
+                parent_run_id=None,
+                name="root",
+                inputs={"system": "null", "user": "null"},
+                outputs={"assistant": "null"},
+                metadata={"user_id": 1337},
+            )
+        }
+        mock_evaluate.return_value = EvaluationResult(
+            scores=[{self.metric: self.metric_score}],
+            dataset=mocked_evaluation_ds,
+            ragas_traces=_unimportant_ragas_traces,
+        )
+
+        # Instantiate evaluator
+        evaluator = RagasEvaluator()
+
+        # Run test
+        result = evaluator.run(
+            dataset=self.base_ds,
+            student_model=self.student_model,
+            run_config=self.run_config,
+            student_openai_client=MagicMock(),  # We pass a mock client
+        )
+
+        # Assertions
+        self.assertIsInstance(result, EvaluationResult)
+        mock_generate_answers_from_model.assert_called_once()
+        mock_evaluate.assert_called_once()
+        # we didn't provide an API key, so it expects to get `api_key=None`
+        mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None)
+
+    @patch("instructlab.eval.ragas.ChatOpenAI")
+    @patch("instructlab.eval.ragas.read_json")
+    @patch("instructlab.eval.ragas.evaluate")
+    @patch.object(RagasEvaluator, "_generate_answers_from_model")
+    @patch.object(RagasEvaluator, "_get_metrics")
+    def test_run_with_dataset_via_path(
+        self,
+        mock_get_metrics: MagicMock,
+        mock_generate_answers_from_model: MagicMock,
+        mock_evaluate: MagicMock,
+        mock_read_json: MagicMock,
+        mock_ChatOpenAI: MagicMock,
+    ):
+        """
+        Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`.
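+        `read_json` is mocked as well, so no file needs to exist at the given path.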
+        """
+        # Prepare mocks
+        mock_get_metrics.return_value = [self.metric]
+        interim_df = DataFrame(
+            {
+                "user_input": [self.user_question],
+                "response": [self.student_model_response],
+                "reference": [self.golden_answer],
+            }
+        )
+        mock_generate_answers_from_model.return_value = interim_df
+        mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
+        _unimportant_ragas_traces = {
+            "default": ChainRun(
+                run_id="42",
+                parent_run_id=None,
+                name="root",
+                inputs={"system": "null", "user": "null"},
+                outputs={"assistant": "null"},
+                metadata={"user_id": 1337},
+            )
+        }
+        mock_evaluate.return_value = EvaluationResult(
+            scores=[{self.metric: self.metric_score}],
+            dataset=mocked_evaluation_ds,
+            ragas_traces=_unimportant_ragas_traces,
+        )
+
+        mock_read_json.return_value = DataFrame(self.base_ds)
+
+        # Instantiate evaluator
+        evaluator = RagasEvaluator()
+
+        # Run test
+        result = evaluator.run(
+            dataset=Path("dummy_path.jsonl"),
+            student_model=self.student_model,
+            run_config=self.run_config,
+            student_openai_client=MagicMock(),
+        )
+
+        # Assertions
+        self.assertIsInstance(result, EvaluationResult)
+        mock_read_json.assert_called_once_with(
+            Path("dummy_path.jsonl"), orient="records", lines=True
+        )
+        mock_generate_answers_from_model.assert_called()
+        mock_evaluate.assert_called()
+
+    @patch("instructlab.eval.ragas.ChatOpenAI")
+    @patch("instructlab.eval.ragas.read_json")
+    @patch("instructlab.eval.ragas.evaluate")
+    @patch.object(RagasEvaluator, "_generate_answers_from_model")
+    @patch.object(RagasEvaluator, "_get_metrics")
+    def test_run_with_instance_attributes(
+        self,
+        mock_get_metrics: MagicMock,
+        mock_generate_answers_from_model: MagicMock,
+        mock_evaluate: MagicMock,
+        mock_read_json: MagicMock,
+        mock_ChatOpenAI: MagicMock,
+    ):
+        """
+        Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`,
+        and `student_openai_client` instead of passing them explicitly.
+        """
+        # Prepare mocks
+        mock_get_metrics.return_value = [self.metric]
+        interim_df = DataFrame(
+            {
+                "user_input": [self.user_question],
+                "response": [self.student_model_response],
+                "reference": [self.golden_answer],
+            }
+        )
+        mock_generate_answers_from_model.return_value = interim_df
+        mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df)
+        _unimportant_ragas_traces = {
+            "default": ChainRun(
+                run_id="42",
+                parent_run_id=None,
+                name="root",
+                inputs={"system": "null", "user": "null"},
+                outputs={"assistant": "null"},
+                metadata={"user_id": 1337},
+            )
+        }
+        mock_evaluate.return_value = EvaluationResult(
+            scores=[{self.metric: self.metric_score}],
+            dataset=mocked_evaluation_ds,
+            ragas_traces=_unimportant_ragas_traces,
+        )
+
+        mock_read_json.return_value = DataFrame(self.base_ds)
+
+        # Instantiate evaluator with instance-level configs
+        evaluator = RagasEvaluator(
+            student_model=self.student_model,
+            student_openai_client=MagicMock(),
+            run_config=self.run_config,
+        )
+
+        # Run test
+        result = evaluator.run(dataset=Path("dummy_path.jsonl"))
+
+        # Assertions
+        self.assertIsInstance(result, EvaluationResult)
+        mock_read_json.assert_called_with(
+            Path("dummy_path.jsonl"), orient="records", lines=True
+        )
+        mock_generate_answers_from_model.assert_called()
+        mock_evaluate.assert_called()
+
+
+if __name__ == "__main__":
+    unittest.main()
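
Usage sketch (illustrative only, not part of the diff above): evaluating a dataset that
already contains model responses, so no student model or client is needed. It assumes
`OPENAI_API_KEY` is exported for the default "gpt-4o" judge model.

# First Party
from instructlab.eval.ragas import RagasEvaluator, Sample

samples: list[Sample] = [
    {
        "user_input": "What is the capital of France?",
        "response": "Paris",
        "reference": "The capital of France is Paris.",
    },
]

# One rubric-based score dict is produced per sample.
result = RagasEvaluator().run(dataset=samples)
print(result.scores)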