import hashlib
import json
import os
from openai import OpenAI
# Chat model used as the pairwise judge.
_MODEL = "gpt-4o-mini"
# Module-level client used by evaluate(). It is built at import time, so
# importing this module raises KeyError if OPENAI_API_KEY is not set.
_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# System prompt: the judge must answer with a JSON object that names the
# winner by presentation position ("1", "2" or "tie") plus a short reason.
_SYSTEM_PROMPT = (
    "You are an impartial judge comparing two candidate responses. "
    "Decide which candidate is the better answer. "
    "Reply with strict JSON: "
    '{"winner": "1" | "2" | "tie", "reason": "<one-sentence justification>"}.'
)
# User message template; {first} and {second} receive the two candidates in
# the order in which they are presented to the judge.
_USER_TEMPLATE = """Candidate 1:
{first}
Candidate 2:
{second}"""
def _seeded_flip(*parts):
    """Decide deterministically whether the two candidates should be swapped.

    The string parts are joined with "|" and hashed with SHA-256, and the
    parity of the first 32 bits of the digest is the answer. Identical inputs
    therefore always yield the same presentation order, which keeps repeated
    runs on the same example reproducible.

    Args:
        *parts: Strings that identify the example (joined with "|").

    Returns:
        True when the leading 32-bit word of the digest is odd, else False.
    """
    seed_material = "|".join(parts).encode("utf-8")
    raw_digest = hashlib.sha256(seed_material).digest()
    # The first four bytes, read big-endian, equal the first eight hex digits.
    leading_word = int.from_bytes(raw_digest[:4], "big")
    return leading_word % 2 == 1
def _interpret_judge_reply(content, flip):
    """Translate the judge's raw reply into an evaluation result dict.

    Args:
        content: Raw message content returned by the judge; expected to be a
            JSON object with "winner" and "reason" keys.
        flip: True if the reference was shown as candidate 1 and the output
            as candidate 2.

    Returns:
        A dict with "label", "score" and "explanation". The label is
        "invalid" (score 0.0) when the reply is not a JSON object or names a
        winner other than "1", "2" or "tie".
    """
    try:
        parsed = json.loads(content)
    except (json.JSONDecodeError, TypeError) as exc:
        return {
            "label": "invalid",
            "score": 0.0,
            "explanation": f"Failed to parse judge response: {exc}",
        }
    # Valid JSON need not be an object; a bare list or string has no .get().
    if not isinstance(parsed, dict):
        return {
            "label": "invalid",
            "score": 0.0,
            "explanation": (
                "Failed to parse judge response: expected a JSON object, "
                f"got {type(parsed).__name__}."
            ),
        }

    raw_winner = str(parsed.get("winner", "")).strip()
    # Accept "Tie" / "TIE" as well as "tie"; the digits are unaffected.
    winner_position = raw_winner.lower()
    # A null reason becomes an empty string rather than the text "None".
    reason = str(parsed.get("reason") or "")[:300]

    # Decode position back to output / reference using the flip we applied.
    output_position = "2" if flip else "1"
    reference_position = "1" if flip else "2"
    labels_by_position = {
        "tie": "tie",
        output_position: "output",
        reference_position: "reference",
    }
    label = labels_by_position.get(winner_position)
    if label is None:
        return {
            "label": "invalid",
            "score": 0.0,
            "explanation": (
                f"Judge returned unexpected winner {raw_winner!r}; "
                f"expected '1', '2', or 'tie'."
            ),
        }

    score = {"output": 1.0, "reference": -1.0, "tie": 0.0}[label]
    return {
        "label": label,
        "score": score,
        "explanation": (
            f"Judge chose position {winner_position} "
            f"(output was shown as position {output_position}). {reason}"
        ),
    }


def evaluate(output, reference):
    """Ask the LLM judge whether `output` or `reference` is the better answer.

    The two candidates are shown to the judge in an order chosen by
    `_seeded_flip`, and the judge's positional verdict is mapped back to
    "output" / "reference" afterwards.

    Args:
        output: Candidate answer under evaluation.
        reference: Reference answer to compare against.

    Returns:
        A dict with three keys:
            "label": "output", "reference", "tie", "missing" or "invalid".
            "score": 1.0 if the output wins, -1.0 if the reference wins,
                0.0 otherwise.
            "explanation": Human-readable description of the verdict.

    Exceptions raised by the API call itself are not caught here.
    """
    if not output or not reference:
        return {
            "label": "missing",
            "score": 0.0,
            "explanation": "Missing output or reference.",
        }

    # Identical candidates can only tie; skip the API call so the verdict
    # does not depend on judge noise or position bias.
    if str(output) == str(reference):
        return {
            "label": "tie",
            "score": 0.0,
            "explanation": (
                "Output and reference are identical; judge was not called."
            ),
        }

    flip = _seeded_flip(str(output), str(reference))
    first, second = (reference, output) if flip else (output, reference)
    response = _client.chat.completions.create(
        model=_MODEL,
        temperature=0,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {
                "role": "user",
                "content": _USER_TEMPLATE.format(first=first, second=second),
            },
        ],
    )

    try:
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError) as exc:
        # An empty or malformed choices list is reported, not raised.
        return {
            "label": "invalid",
            "score": 0.0,
            "explanation": f"Failed to parse judge response: {exc}",
        }
    return _interpret_judge_reply(content, flip)