A Test Set for Measuring a Model's Ability to Improve
Test Set Design Overview
The test set is used to evaluate how much a model improves as it is iterated on. A well-designed test set should be able to detect significant changes in the model's performance while staying distributionally consistent with the training data.
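For concreteness, a hypothetical test_data.json consumed by the code below might look like the following. The top-level test_cases key, the shared fields (id, type, threshold), and the per-type fields (question/options/correct_answer, prompt/output_type, problem/expected_steps) are what the loader and evaluators expect; the concrete values are purely illustrative.

{
  "test_cases": [
    {
      "id": "qa-001",
      "type": "qa",
      "threshold": 0.8,
      "question": "Which data structure offers O(1) average-case lookup?",
      "options": ["list", "hash table", "binary tree"],
      "correct_answer": "hash table"
    },
    {
      "id": "gen-001",
      "type": "generation",
      "threshold": 0.6,
      "output_type": "code",
      "prompt": "Write a function that reverses a string."
    },
    {
      "id": "rsn-001",
      "type": "reasoning",
      "threshold": 0.5,
      "problem": "If every A is a B and every B is a C, is every A a C?",
      "expected_steps": ["A -> B", "B -> C", "therefore A -> C"]
    }
  ]
}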
Test Set Construction Code
import difflib
import json
import re
from typing import Dict, List

import torch
class IterationTestSet:
    """Test set for measuring a model's ability to improve across iterations."""

    def __init__(self, test_data_path: str):
        self.test_data = self._load_test_data(test_data_path)
        self.baseline_scores: Dict[str, float] = {}
        # Maps iteration number -> evaluation results for that iteration
        self.iteration_history: Dict[int, Dict] = {}

    def _load_test_data(self, path: str) -> List[Dict]:
        """Load test cases from a JSON file."""
        with open(path, 'r') as f:
            data = json.load(f)
        return data.get('test_cases', [])
    def evaluate_model(self, model, iteration: int) -> Dict:
        """Evaluate the model's performance at the given iteration."""
        results = {
            'iteration': iteration,
            'total_cases': len(self.test_data),
            'passed': 0,
            'failed': 0,
            'scores': [],
            'improvement': 0.0,
        }
        for case in self.test_data:
            try:
                score = self._evaluate_single_case(model, case)
                results['scores'].append(score)
                # Compare against the recorded baseline for this case
                baseline = self.baseline_scores.get(case['id'], 0.0)
                results['improvement'] += score - baseline
                if score >= case['threshold']:
                    results['passed'] += 1
                else:
                    results['failed'] += 1
            except Exception as e:
                print(f"Evaluation failed: {e}")
                results['failed'] += 1
        return results
    def _evaluate_single_case(self, model, case: Dict) -> float:
        """Evaluate a single test case, dispatching on its type."""
        test_type = case.get('type', 'qa')
        if test_type == 'qa':
            return self._evaluate_qa(model, case)
        elif test_type == 'generation':
            return self._evaluate_generation(model, case)
        elif test_type == 'reasoning':
            return self._evaluate_reasoning(model, case)
        else:
            return self._evaluate_default(model, case)
    def _evaluate_qa(self, model, case: Dict) -> float:
        """Evaluate a question-answering case."""
        question = case['question']
        options = case['options']
        correct_answer = case['correct_answer']
        # Generate the model's answer
        with torch.no_grad():
            output = model.generate(question, options)
        # Grade the answer against the reference
        similarity = self._similarity(output, correct_answer)
        if correct_answer in output:
            return 1.0  # exact match
        elif similarity > 0.8:
            return 0.8  # highly similar
        elif similarity > 0.5:
            return 0.5  # moderately similar
        else:
            return 0.0  # wrong

    @staticmethod
    def _similarity(a: str, b: str) -> float:
        """Crude string similarity; a production evaluator should use embeddings."""
        return difflib.SequenceMatcher(None, a, b).ratio()
    def _evaluate_generation(self, model, case: Dict) -> float:
        """Evaluate a generation case."""
        prompt = case['prompt']
        expected_output_type = case.get('output_type', 'text')
        # Generate the model's output
        with torch.no_grad():
            output = model.generate(prompt)
        # Assess the quality of the generated output
        return self._assess_generation_quality(output, expected_output_type)
    def _evaluate_reasoning(self, model, case: Dict) -> float:
        """Evaluate a reasoning case."""
        problem = case['problem']
        expected_steps = case.get('expected_steps', [])
        # Generate the model's reasoning
        with torch.no_grad():
            reasoning = model.reason(problem)
        # Score coherence and correctness, then average the two
        coherence = self._assess_reasoning_coherence(reasoning, problem)
        correctness = self._assess_reasoning_correctness(reasoning, expected_steps)
        return (coherence + correctness) / 2

    def _assess_reasoning_coherence(self, reasoning: str, problem: str) -> float:
        """Placeholder coherence check: reward multi-step, non-empty reasoning."""
        steps = [s for s in reasoning.split('\n') if s.strip()]
        return min(1.0, len(steps) / 3) if reasoning.strip() else 0.0

    def _assess_reasoning_correctness(self, reasoning: str, expected_steps: List[str]) -> float:
        """Placeholder correctness check: fraction of expected steps mentioned."""
        if not expected_steps:
            return 0.5  # nothing to check against
        hits = sum(1 for step in expected_steps if step.lower() in reasoning.lower())
        return hits / len(expected_steps)

    def _evaluate_default(self, model, case: Dict) -> float:
        """Fallback for unknown test types: any non-empty output gets partial credit."""
        with torch.no_grad():
            output = model.generate(case.get('prompt', ''))
        return 0.5 if str(output).strip() else 0.0
    def _assess_generation_quality(self, output: str, expected_type: str) -> float:
        """Assess generation quality according to the expected output type."""
        if expected_type == 'text':
            return self._evaluate_text_quality(output)
        elif expected_type == 'code':
            return self._evaluate_code_quality(output)
        else:
            return 0.5  # default score for unknown output types
    def _evaluate_text_quality(self, text: str) -> float:
        """Assess text quality as the mean of fluency, relevance, and grammar."""
        fluency = self._measure_fluency(text)
        relevance = self._measure_relevance(text)
        grammar = self._measure_grammar(text)
        return (fluency + relevance + grammar) / 3
    def _measure_fluency(self, text: str) -> float:
        """Measure fluency (simplified; a real metric should use a language model)."""
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            return 0.0
        # Reward reasonably long sentences, saturating at 50 characters
        avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
        return min(1.0, avg_sentence_length / 50)
    def _measure_relevance(self, text: str) -> float:
        """Measure relevance (simplified keyword check; use embeddings in practice)."""
        keywords = ['improvement', 'performance', 'result', 'method', 'algorithm']
        found_keywords = sum(1 for kw in keywords if kw in text.lower())
        return min(1.0, found_keywords / 3)
    def _measure_grammar(self, text: str) -> float:
        """Measure grammaticality (simplified; checks a few surface issues)."""
        text = text.strip()
        if not text:
            return 0.0
        issues = 0
        if not text[0].isupper():
            issues += 1  # sentence should start with a capital letter
        if not text.endswith(('.', '。')):
            issues += 1  # missing sentence-final period
        return max(0.0, 1.0 - issues * 0.2)
    def _evaluate_code_quality(self, code: str) -> float:
        """Assess code quality: syntax, style, and (stubbed) logic."""
        # 1. Syntactic validity
        try:
            compile(code, '<generated>', 'exec')
            syntax_score = 1.0
        except SyntaxError:
            syntax_score = 0.0
        # 2. Code style
        style_score = self._measure_code_style(code)
        # 3. Logic (a fixed score in this simplified version)
        logic_score = 0.7
        return (syntax_score + style_score + logic_score) / 3
    def _measure_code_style(self, code: str) -> float:
        """Measure code style (simplified: indentation and naming checks)."""
        lines = [line for line in code.split('\n') if line.strip()]
        # Indentation is consistent if it never mixes tabs and spaces
        indent_chars = {
            ch
            for line in lines
            for ch in line[:len(line) - len(line.lstrip())]
        }
        consistent_indent = indent_chars <= {' '} or indent_chars <= {'\t'}
        # Naming convention: look for snake_case identifiers
        identifiers = re.findall(r'[A-Za-z_][A-Za-z0-9_]*', code)
        has_snake_case = any('_' in name for name in identifiers)
        return 0.5 * float(consistent_indent) + 0.5 * float(has_snake_case)
    def set_baseline(self, model, iteration: int = 0) -> Dict:
        """Record baseline scores for every test case."""
        self.baseline_scores = {}
        for case in self.test_data:
            score = self._evaluate_single_case(model, case)
            self.baseline_scores[case['id']] = score
        return self.baseline_scores
    def get_iteration_report(self, iteration: int, model=None) -> Dict:
        """Build a report for one iteration, re-evaluating if a model is given."""
        if model is not None:
            results = self.evaluate_model(model, iteration)
            self.iteration_history[iteration] = results
        else:
            results = self.iteration_history.get(iteration, {})
        return {
            'current_iteration': iteration,
            'results': results,
            'history': dict(self.iteration_history),
            'trend': self._calculate_trend(),
        }
    def _calculate_trend(self) -> str:
        """Classify the performance trend over the most recent iterations."""
        if not self.iteration_history:
            return "no_data"
        recent_scores = []
        # Average score for each of the last five recorded iterations
        for iter_results in list(self.iteration_history.values())[-5:]:
            scores = iter_results.get('scores', [])
            recent_scores.append(sum(scores) / len(scores) if scores else 0.0)
        if len(recent_scores) < 2:
            return "insufficient_data"
        if recent_scores[-1] > recent_scores[-2]:
            return "improving"
        elif recent_scores[-1] < recent_scores[-2]:
            return "declining"
        else:
            return "stable"
# Usage example
if __name__ == "__main__":
    test_set = IterationTestSet("test_data.json")

    # Stand-in model used to demonstrate the API
    class MockModel:
        def generate(self, *args, **kwargs):
            return "mock response"

        def reason(self, problem):
            return "mock reasoning"

    baseline = test_set.set_baseline(MockModel(), iteration=0)
    print(f"Baseline scores: {baseline}")

    # Produce a report for the first iteration
    report = test_set.get_iteration_report(1, MockModel())
    print(f"Iteration report: {report}")
    print(f"Performance trend: {report['trend']}")