# How to Migrate to the New Evals Interfaces
The following interfaces are DEPRECATED and should no longer be used:
- `phoenix.evals.models` module (all model classes)
- `phoenix.evals.llm_classify` function
- `phoenix.evals.llm_generate` function
- `phoenix.evals.run_evals` function
- `phoenix.evals.templates.PromptTemplate` class
- `phoenix.evals` root module

Legacy documentation: https://arize-phoenix.readthedocs.io/projects/evals/en/latest/api/legacy.html
The new Phoenix Evals API (v2.0+) provides:
- `phoenix.evals.llm.LLM`
- `create_classifier` and `create_evaluator`
- `evaluate_dataframe`

Model classes:

| DEPRECATED | NEW INTERFACE |
|---|---|
| `OpenAIModel(model="gpt-4o")` | `LLM(provider="openai", model="gpt-4o")` |
| `AnthropicModel(model="claude-3-sonnet-20240229")` | `LLM(provider="litellm", model="claude-3-sonnet-20240229")` |
| `GeminiModel(...)` and other `phoenix.evals.models` classes | `LLM(provider=..., model=...)` |
Core functions:

| DEPRECATED | NEW INTERFACE |
|---|---|
| `llm_classify` | `create_classifier` + `evaluate_dataframe` |
| `run_evals` | `evaluate_dataframe` |
| `llm_generate` | `LLM.generate_text` |
Prompt templates:

| DEPRECATED | NEW INTERFACE |
|---|---|
| `PromptTemplate(template="...")` | Raw strings or template strings with `{variable}` placeholders |
| `template=` argument | Prompt passed with the `prompt_template` parameter |
Built-in evaluators:

| DEPRECATED | NEW INTERFACE |
|---|---|
| `LLMEvaluator` | `create_evaluator` or custom evaluator |
| `HallucinationEvaluator` | `phoenix.evals.metrics` (new implementation) |
| `QAEvaluator` | Create with `create_classifier` |
| `RelevanceEvaluator` | Create with `create_classifier` |
| `ToxicityEvaluator` | Create with `create_classifier` |
| `SummarizationEvaluator` | Create with `create_classifier` |
| `SQLEvaluator` | Create with `create_classifier` |
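For reference, all of the new entry points used in the examples below come from two imports. A minimal sketch (the provider and model names are simply the ones used throughout this guide):

```python
from phoenix.evals import create_classifier, create_evaluator, evaluate_dataframe
from phoenix.evals.llm import LLM

# A single LLM object is shared by every evaluator, regardless of provider
llm = LLM(provider="openai", model="gpt-4o")
```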
Migrating `llm_classify` (single evaluation):

DEPRECATED:
```python
from phoenix.evals import llm_classify
from phoenix.evals.models import OpenAIModel
from phoenix.evals.templates import PromptTemplate

# Old way
model = OpenAIModel(model="gpt-4o")
template = PromptTemplate(
    template="Is the response helpful?\n\nQuery: {input}\nResponse: {output}. Respond either as 'helpful' or 'not_helpful'"
)
evals_df = llm_classify(
    data=spans_df,
    model=model,
    rails=["helpful", "not_helpful"],
    template=template,
    exit_on_error=False,
    provide_explanation=True,
)

# Manual score assignment
evals_df["score"] = evals_df["label"].apply(lambda x: 1 if x == "helpful" else 0)
```
NEW:
```python
import pandas as pd

from phoenix.evals import create_classifier, evaluate_dataframe
from phoenix.evals.llm import LLM

# New way
llm = LLM(provider="openai", model="gpt-4o")

helpfulness_evaluator = create_classifier(
    name="helpfulness",
    prompt_template="Is the response helpful?\n\nQuery: {input}\nResponse: {output}",
    llm=llm,
    choices={"helpful": 1.0, "not_helpful": 0.0},  # Automatic scoring
)

results_df = evaluate_dataframe(
    dataframe=spans_df,
    evaluators=[helpfulness_evaluator],
)
```
Migrating `llm_classify` (multiple evaluations in one pass):

DEPRECATED:
```python
from phoenix.evals import llm_classify
from phoenix.evals.models import OpenAIModel

model = OpenAIModel(model="gpt-4o")

# Multiple separate calls
relevance_df = llm_classify(data=df, model=model, rails=["relevant", "irrelevant"], ...)
helpfulness_df = llm_classify(data=df, model=model, rails=["helpful", "not_helpful"], ...)
toxicity_df = llm_classify(data=df, model=model, rails=["toxic", "non_toxic"], ...)

# Manual merging required
```
NEW:
```python
from phoenix.evals import create_classifier, evaluate_dataframe
from phoenix.evals.llm import LLM

llm = LLM(provider="openai", model="gpt-4o")

# Create multiple evaluators
relevance_evaluator = create_classifier(
    name="relevance",
    prompt_template="Is the response relevant?\n\nQuery: {input}\nResponse: {output}",
    llm=llm,
    choices={"relevant": 1.0, "irrelevant": 0.0},
)
helpfulness_evaluator = create_classifier(
    name="helpfulness",
    prompt_template="Is the response helpful?\n\nQuery: {input}\nResponse: {output}",
    llm=llm,
    choices={"helpful": 1.0, "not_helpful": 0.0},
)
toxicity_evaluator = create_classifier(
    name="toxicity",
    prompt_template="Is the response toxic?\n\nQuery: {input}\nResponse: {output}",
    llm=llm,
    choices={"toxic": 0.0, "non_toxic": 1.0},
)

# Single call evaluates all metrics
results_df = evaluate_dataframe(
    dataframe=df,
    evaluators=[relevance_evaluator, helpfulness_evaluator, toxicity_evaluator],
)
```
Migrating `llm_generate`:

DEPRECATED:
```python
from phoenix.evals import llm_generate
from phoenix.evals.models import OpenAIModel
from phoenix.evals.templates import PromptTemplate

model = OpenAIModel(model="gpt-4o")
template = PromptTemplate(template="Generate a response to: {query}")

generated_df = llm_generate(
    dataframe=df,
    template=template,
    model=model,
)
```
NEW:
```python
from phoenix.evals.llm import LLM

llm = LLM(provider="openai", model="gpt-4o")

# For single generations
response = llm.generate_text(prompt="Generate a response to: How do I reset my password?")

# For batch processing with dataframes
def generate_responses(row):
    prompt = f"Generate a response to: {row['query']}"
    return llm.generate_text(prompt=prompt)

df['generated_response'] = df.apply(generate_responses, axis=1)
```
Migrating custom evaluators (`LLMEvaluator`):

DEPRECATED:
```python
from phoenix.evals import LLMEvaluator
from phoenix.evals.models import OpenAIModel

class CustomEvaluator(LLMEvaluator):
    def evaluate(self, input_text, output_text):
        # Custom logic
        pass

evaluator = CustomEvaluator(model=OpenAIModel(model="gpt-4o"))
```
NEW:
```python
from phoenix.evals import create_evaluator, LLMEvaluator
from phoenix.evals.llm import LLM

# Option 1: Function-based evaluator
@create_evaluator(name="custom_metric", direction="maximize")
def custom_evaluator(input: str, output: str) -> float:
    # Custom heuristic logic
    return len(output) / len(input)  # Example metric

# Option 2: LLM-based evaluator
llm = LLM(provider="openai", model="gpt-4o")

class CustomLLMEvaluator(LLMEvaluator):
    def __init__(self):
        super().__init__(
            name="custom_llm_eval",
            llm=llm,
            prompt_template="Evaluate this response: {input} -> {output}",
        )

    def _evaluate(self, eval_input):
        # Custom LLM evaluation logic
        pass
```
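Either kind of custom evaluator can then be run through `evaluate_dataframe` alongside the built-in classifiers. A minimal sketch, assuming a dataframe with the `input` and `output` columns referenced by the templates above:

```python
from phoenix.evals import evaluate_dataframe

# Run the function-based and LLM-based custom evaluators in one pass
results_df = evaluate_dataframe(
    dataframe=df,
    evaluators=[custom_evaluator, CustomLLMEvaluator()],
)
```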
Migrating model classes:

DEPRECATED:
```python
from phoenix.evals.models import OpenAIModel, AnthropicModel, GeminiModel

openai_model = OpenAIModel(model="gpt-4o")
anthropic_model = AnthropicModel(model="claude-3-sonnet-20240229")
```
NEW:
```python
from phoenix.evals.llm import LLM

# All providers use the same interface
openai_llm = LLM(provider="openai", model="gpt-4o")
litellm_llm = LLM(provider="litellm", model="claude-3-sonnet-20240229")
```
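Because every provider is reached through the same `LLM` interface, evaluators built with `create_classifier` are provider-agnostic. A hedged sketch reusing the helpfulness classifier from the earlier example; only the `llm` argument changes:

```python
from phoenix.evals import create_classifier

# The same classifier definition works with either LLM object
helpfulness_evaluator = create_classifier(
    name="helpfulness",
    prompt_template="Is the response helpful?\n\nQuery: {input}\nResponse: {output}",
    llm=litellm_llm,  # or openai_llm
    choices={"helpful": 1.0, "not_helpful": 0.0},
)
```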
When migrating your code:
✅ Update imports
- Replace `phoenix.evals.models.*` with `phoenix.evals.llm.LLM`
- Replace `phoenix.evals.llm_classify` with `phoenix.evals.create_classifier`
- Replace `phoenix.evals.llm_generate` with direct `LLM` calls

✅ Update model instantiation

- Use the `LLM(provider="...", model="...")` interface

✅ Replace function calls

- Change `llm_classify` to `create_classifier` + `evaluate_dataframe`
- Change `llm_generate` to `LLM.generate_text`
- Change `run_evals` to `evaluate_dataframe`

✅ Update templates

- Replace `PromptTemplate` objects with plain prompt strings
- Replace the `rails` parameter with a `choices` dictionary

✅ Update evaluators

- Use `create_classifier` for classification tasks
- Use the `create_evaluator` decorator for custom metrics
- Use the pre-built evaluators in `phoenix.evals.metrics`

✅ Test the migration
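A quick way to test the migration is to run one migrated evaluator over a couple of hand-written rows before pointing it at real spans. A minimal sketch, assuming the helpfulness classifier from the examples above (the sample rows and column names are illustrative):

```python
import pandas as pd

from phoenix.evals import create_classifier, evaluate_dataframe
from phoenix.evals.llm import LLM

llm = LLM(provider="openai", model="gpt-4o")

helpfulness_evaluator = create_classifier(
    name="helpfulness",
    prompt_template="Is the response helpful?\n\nQuery: {input}\nResponse: {output}",
    llm=llm,
    choices={"helpful": 1.0, "not_helpful": 0.0},
)

# Two hand-written rows are enough to confirm the evaluator runs end to end
smoke_test_df = pd.DataFrame(
    {
        "input": ["How do I reset my password?", "What is the capital of France?"],
        "output": ["Click 'Forgot password' on the login page.", "I don't know."],
    }
)

results_df = evaluate_dataframe(dataframe=smoke_test_df, evaluators=[helpfulness_evaluator])
print(results_df.head())
```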