# example.py
import pandas as pd
from phoenix.evals import LLM, create_classifier, evaluate_dataframe
# Model handle for the evaluation: provider and model id are fixed here and
# the object is handed to the classifier through its `llm=` argument.
llm = LLM(
    provider="anthropic",
    model="claude-sonnet-4-6",
)
# Prompt for the hallucination classifier. The `{input}`, `{output}` and
# `{reference}` placeholders carry the same names as the columns of the
# DataFrame that is evaluated later in this script. Every line, including
# the last one, ends with a newline.
HALLUCINATION_PROMPT = (
    "Determine whether the answer below is factually supported by the\n"
    "reference. Reply with exactly one of: factual, hallucinated.\n"
    "Question: {input}\n"
    "Answer: {output}\n"
    "Reference: {reference}\n"
)
# Classifier evaluator named "hallucination": it is driven by the prompt
# template above and the `llm` handle, and reports one of the labels listed
# in `choices`.
evaluator = create_classifier(
    llm=llm,
    name="hallucination",
    prompt_template=HALLUCINATION_PROMPT,
    # Label -> numeric score for every label the LLM may emit. With the
    # default `direction="maximize"` a higher score is the better outcome,
    # so "factual" (1.0) is good and "hallucinated" (0.0) is bad.
    choices={"factual": 1.0, "hallucinated": 0.0},
)
# Two-row sample set, written column-wise. Both rows share the same question
# and reference; row 0 carries an answer that agrees with the reference and
# row 1 an answer that contradicts it.
df = pd.DataFrame(
    {
        "input": ["What is the capital of France?"] * 2,
        "output": [
            "Paris is the capital of France.",
            "Berlin is the capital of France.",
        ],
        "reference": [
            "Paris is the capital and most populous city of France.",
        ]
        * 2,
    }
)
def _score_value(cell):
    """Return the numeric ``score`` held in one ``hallucination_score`` cell.

    Args:
        cell: A non-missing cell value: a dict-like Score row, or the JSON
            text of one.

    Returns:
        The value stored under the ``"score"`` key.

    Raises:
        KeyError: If the row has no ``"score"`` key.
    """
    if isinstance(cell, str):
        # NOTE(review): some phoenix-evals releases appear to store the Score
        # row as JSON text rather than a dict; decoding keeps the example
        # working either way. Confirm against the installed version.
        import json

        cell = json.loads(cell)
    return cell["score"]


results = evaluate_dataframe(dataframe=df, evaluators=[evaluator])
# `hallucination_score` is a Score row (a dict-like with `score`, `label`,
# `explanation`, …); pull the numeric out for a flat display column.
# `na_action="ignore"` passes missing cells (None/NaN) through untouched
# instead of subscripting them, so a row without a score (presumably a failed
# evaluation; verify) shows up as missing rather than raising TypeError.
results["score"] = results["hallucination_score"].map(
    _score_value, na_action="ignore"
)
print(results[["input", "output", "score"]].to_string())