Preferences Models
Preferences define stakeholder objectives and the evaluation machinery used to score
workflow outcomes.
Preference
A single objective with weight, description, and evaluator metadata.
Bases: BaseModel
A single preference dimension with its weight and associated evaluator.
Source code in manager_agent_gym/schemas/preferences/preference.py
class Preference(BaseModel):
    """
    A single preference dimension with its weight and associated evaluator.
    """

    name: str = Field(..., description="Name of the preference dimension")
    weight: float = Field(
        ge=0.0, le=1.0, description="Weight/importance of this preference [0,1]"
    )
    description: str | None = Field(
        default=None,
        description="Optional description of what this preference measures",
    )
    evaluator: Evaluator | None = Field(
        default=None,
        description="Evaluator defining rubrics and aggregation for this preference",
    )

    def get_rubric_names(self) -> List[str]:
        """Get names of all rubrics in this preference's evaluator."""
        if self.evaluator is None:
            return []
        return [rubric.name for rubric in self.evaluator.rubrics]
get_rubric_names() -> List[str]
Get names of all rubrics in this preference's evaluator.
Source code in manager_agent_gym/schemas/preferences/preference.py
def get_rubric_names(self) -> List[str]:
    """Get names of all rubrics in this preference's evaluator."""
    if self.evaluator is None:
        return []
    return [rubric.name for rubric in self.evaluator.rubrics]
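For illustration, a minimal sketch of constructing a Preference without an evaluator. The import path is inferred from the source location shown above; adjust it if the package re-exports these models elsewhere.

    from manager_agent_gym.schemas.preferences.preference import Preference

    # A quality objective with weight 0.7; no evaluator attached yet.
    quality = Preference(
        name="quality",
        weight=0.7,
        description="How well deliverables meet acceptance criteria",
    )

    # With no evaluator there are no rubrics to report.
    print(quality.get_rubric_names())  # []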
Preference Weights
A collection of weighted objectives that drive manager optimization and evaluation.
Bases: BaseModel
A collection of multi-objective preference weights for workflow optimization.
Weights are automatically normalized to sum to 1.0 upon initialization.
Source code in manager_agent_gym/schemas/preferences/preference.py
class PreferenceWeights(BaseModel):
    """
    A collection of multi-objective preference weights for workflow optimization.
    Weights are automatically normalized to sum to 1.0 upon initialization.
    """

    preferences: List[Preference] = Field(
        default_factory=list, description="List of preference dimensions"
    )
    timestep: int = Field(
        default=0, description="Timestep at which these preferences apply"
    )

    @model_validator(mode="after")
    def normalize_weights(self) -> "PreferenceWeights":
        total_weight = sum(p.weight for p in self.preferences)
        if total_weight > 0:
            for p in self.preferences:
                p.weight = p.weight / total_weight
        elif self.preferences:
            equal_weight = 1.0 / len(self.preferences)
            for p in self.preferences:
                p.weight = equal_weight
        return self

    def get_preference_names(self) -> List[str]:
        """Get all preference dimension names."""
        return [pref.name for pref in self.preferences]

    def get_preference_dict(self) -> dict[str, float]:
        """Get preferences as a dictionary mapping name to normalized weight."""
        return {pref.name: pref.weight for pref in self.preferences}

    def normalize(self) -> "PreferenceWeights":
        """Return a new PreferenceWeights with normalized weights."""
        return PreferenceWeights(preferences=[p.model_copy() for p in self.preferences])

    def get_preference_summary(self) -> str:
        """Get a summary of the preferences."""
        return "\n".join([f"{pref.name}: {pref.weight}" for pref in self.preferences])
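A hedged sketch of the automatic normalization. Raw weights 0.6, 0.3, and 0.3 sum to 1.2, so the model_validator rescales them to 0.5, 0.25, and 0.25 at construction time. Import paths are assumed from the source location above.

    from manager_agent_gym.schemas.preferences.preference import (
        Preference,
        PreferenceWeights,
    )

    # Raw weights sum to 1.2; normalize_weights divides each by the total on init.
    prefs = PreferenceWeights(
        preferences=[
            Preference(name="quality", weight=0.6),
            Preference(name="speed", weight=0.3),
            Preference(name="cost", weight=0.3),
        ]
    )

    print(prefs.get_preference_dict())
    # {'quality': 0.5, 'speed': 0.25, 'cost': 0.25}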
get_preference_dict() -> dict[str, float]
Get preferences as a dictionary mapping name to normalized weight.
Source code in manager_agent_gym/schemas/preferences/preference.py
def get_preference_dict(self) -> dict[str, float]:
    """Get preferences as a dictionary mapping name to normalized weight."""
    return {pref.name: pref.weight for pref in self.preferences}
get_preference_names() -> List[str]
Get all preference dimension names.
Source code in manager_agent_gym/schemas/preferences/preference.py
def get_preference_names(self) -> List[str]:
    """Get all preference dimension names."""
    return [pref.name for pref in self.preferences]
get_preference_summary() -> str
Get a summary of the preferences.
Source code in manager_agent_gym/schemas/preferences/preference.py
def get_preference_summary(self) -> str:
    """Get a summary of the preferences."""
    return "\n".join([f"{pref.name}: {pref.weight}" for pref in self.preferences])
normalize() -> PreferenceWeights
Return a new PreferenceWeights with normalized weights.
Source code in manager_agent_gym/schemas/preferences/preference.py
def normalize(self) -> "PreferenceWeights":
    """Return a new PreferenceWeights with normalized weights."""
    return PreferenceWeights(preferences=[p.model_copy() for p in self.preferences])
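Continuing the sketch above, the accessor methods read the already-normalized weights, and normalize() builds a fresh PreferenceWeights from copies of each Preference.

    print(prefs.get_preference_names())   # ['quality', 'speed', 'cost']

    print(prefs.get_preference_summary())
    # quality: 0.5
    # speed: 0.25
    # cost: 0.25

    # A new, independent PreferenceWeights built from copies of each Preference.
    copy = prefs.normalize()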
Evaluation Result
Structured output from preference evaluators summarising scores and reasoning.
Bases: BaseModel
Comprehensive evaluation result for a single timestep.
Source code in manager_agent_gym/schemas/preferences/evaluation.py
class EvaluationResult(BaseModel):
    """Comprehensive evaluation result for a single timestep."""

    workflow_id: UUID = Field(..., description="ID of the workflow evaluated")
    timestep: int = Field(..., description="Timestep of the evaluation")
    timestamp: datetime = Field(
        default_factory=datetime.now, description="Time of evaluation"
    )
    preference_scores: dict[str, PreferenceScore] = Field(
        ..., description="Scores for each preference dimension"
    )
    evaluation_results: list[RubricGroupResult] = Field(
        ..., description="Results of all the evaluations run outside of preferences"
    )
    weighted_preference_total: float = Field(
        ..., description="Weighted sum of all preference scores"
    )
    metrics: dict[str, Any] = Field(
        default_factory=dict, description="Additional aggregated metrics"
    )

    def pretty_print(self) -> str:
        """Return a compact, human-readable summary of the evaluation."""
        lines: list[str] = []
        lines.append(f"Evaluation (timestep={self.timestep})")

        # Preferences
        if self.preference_scores:
            lines.append("- Preferences:")
            for name, ps in self.preference_scores.items():
                lines.append(
                    f"  • {name}: score={ps.score:.3f}, weight={ps.weight:.3f}"
                )

        # Workflow-level evaluators
        if self.evaluation_results:
            lines.append("- Workflow Evaluators:")
            for group in self.evaluation_results:
                agg = (
                    f" (agg={group.aggregated_score:.3f}"
                    f" via {group.aggregation_strategy})"
                    if group.aggregated_score is not None
                    else ""
                )
                lines.append(f"  • {group.evaluator_name}{agg}")
                for rr in group.rubric_scores[:5]:
                    lines.append(
                        f"    - {rr.name}: {rr.normalized_score:.3f} (raw={rr.score:.3f}/{rr.max_score:.1f})"
                    )
                if len(group.rubric_scores) > 5:
                    lines.append(
                        f"    … and {len(group.rubric_scores) - 5} more rubrics"
                    )

        # Utility
        lines.append(f"- Total utility: {self.weighted_preference_total:.3f}")
        return "\n".join(lines)
pretty_print() -> str
Return a compact, human-readable summary of the evaluation.
Source code in manager_agent_gym/schemas/preferences/evaluation.py
def pretty_print(self) -> str:
    """Return a compact, human-readable summary of the evaluation."""
    lines: list[str] = []
    lines.append(f"Evaluation (timestep={self.timestep})")

    # Preferences
    if self.preference_scores:
        lines.append("- Preferences:")
        for name, ps in self.preference_scores.items():
            lines.append(
                f"  • {name}: score={ps.score:.3f}, weight={ps.weight:.3f}"
            )

    # Workflow-level evaluators
    if self.evaluation_results:
        lines.append("- Workflow Evaluators:")
        for group in self.evaluation_results:
            agg = (
                f" (agg={group.aggregated_score:.3f}"
                f" via {group.aggregation_strategy})"
                if group.aggregated_score is not None
                else ""
            )
            lines.append(f"  • {group.evaluator_name}{agg}")
            for rr in group.rubric_scores[:5]:
                lines.append(
                    f"    - {rr.name}: {rr.normalized_score:.3f} (raw={rr.score:.3f}/{rr.max_score:.1f})"
                )
            if len(group.rubric_scores) > 5:
                lines.append(
                    f"    … and {len(group.rubric_scores) - 5} more rubrics"
                )

    # Utility
    lines.append(f"- Total utility: {self.weighted_preference_total:.3f}")
    return "\n".join(lines)
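A minimal, hedged sketch of constructing an EvaluationResult by hand (in normal use the evaluation engine produces these). Empty score collections are used so that only the fields shown above are assumed; the import path is inferred from the source location.

    from uuid import uuid4
    from manager_agent_gym.schemas.preferences.evaluation import EvaluationResult

    result = EvaluationResult(
        workflow_id=uuid4(),
        timestep=3,
        preference_scores={},     # would map preference name -> PreferenceScore
        evaluation_results=[],    # workflow-level RubricGroupResult entries
        weighted_preference_total=0.72,
    )

    print(result.pretty_print())
    # Evaluation (timestep=3)
    # - Total utility: 0.720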
Workflow Rubric
Declarative rubric configuration for rule-based or LLM-based evaluation hooks.
Bases: BaseModel
Workflow-level rubric that evaluates a workflow using either a Python function
or an LLM prompt. Exactly one evaluation source must be provided.
Source code in manager_agent_gym/schemas/preferences/rubric.py
class WorkflowRubric(BaseModel):
    """
    Workflow-level rubric that evaluates a workflow using either a Python function
    or an LLM prompt. Exactly one evaluation source must be provided.
    """

    name: str = Field(..., description="Name of the rubric")
    description: str | None = Field(
        default=None, description="Description of what this rubric measures"
    )
    max_score: float = Field(1.0, gt=0.0, description="Maximum possible score")
    evaluator_function: Callable[..., Any] | None = Field(
        default=None,
        description=(
            "Python function taking a workflow and returning either a numeric score,"
            " a (score, reasoning) tuple, an EvaluatedScore-like object with 'score' and"
            " 'reasoning', or any custom type (captured as raw_output)."
        ),
    )
    llm_prompt: str | None = Field(
        default=None,
        description="LLM prompt to use for evaluation (0..max_score output)",
    )
    llm_model: str = Field(
        default="o3", description="LLM model name to use if llm_prompt is provided"
    )
    run_condition: RunCondition = Field(
        default=RunCondition.EACH_TIMESTEP,
        description="When this rubric should be evaluated",
    )
    required_context: Set[AdditionalContextItem] = Field(
        default_factory=set,
        description="Optional set of context items this rubric needs at evaluation time",
    )

    @model_validator(mode="after")
    def check_evaluator_source(self) -> "WorkflowRubric":
        if self.evaluator_function is None and self.llm_prompt is None:
            raise ValueError("Must provide either evaluator_function or llm_prompt")
        if self.evaluator_function is not None and self.llm_prompt is not None:
            raise ValueError(
                "Provide either evaluator_function OR llm_prompt, not both"
            )
        return self
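Two hedged construction sketches, one per evaluation source. The workflow attributes referenced inside the example function (tasks, completed) are hypothetical placeholders for whatever object the engine passes at evaluation time; the import path is inferred from the source location above.

    from manager_agent_gym.schemas.preferences.rubric import WorkflowRubric

    def all_tasks_completed(workflow) -> tuple[float, str]:
        # Hypothetical check; returns a (score, reasoning) tuple as described above.
        tasks = getattr(workflow, "tasks", [])
        done = all(getattr(t, "completed", False) for t in tasks)
        return (1.0 if done else 0.0,
                "all tasks completed" if done else "open tasks remain")

    # Function-based rubric.
    code_rubric = WorkflowRubric(
        name="task_completion",
        description="All tasks in the workflow are completed",
        evaluator_function=all_tasks_completed,
    )

    # LLM-based rubric scored on a 0..5 scale.
    llm_rubric = WorkflowRubric(
        name="communication_clarity",
        max_score=5.0,
        llm_prompt="Rate the clarity of manager communications from 0 to 5.",
    )

    # Supplying both sources, or neither, raises a ValueError from check_evaluator_source.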