from deepeval.test_case import LLMTestCase
# A single LLMTestCase pairing a source document (the input) with the
# candidate summary to be evaluated (the actual output).
original_text = """In the rapidly evolving digital landscape, the proliferation of artificial intelligence (AI) technologies has been a game-changer in various industries, ranging from healthcare to finance. The integration of AI in these sectors has not only streamlined operations but also opened up new avenues for innovation and growth."""
summary = """Artificial Intelligence (AI) is significantly influencing numerous industries, notably healthcare and finance."""

test_case = LLMTestCase(input=original_text, actual_output=summary)
from deepeval.test_case import LLMTestCase
# Hypothetical test data from your test dataset, containing the original
# text and the summary to evaluate for a summarization task.
test_data = [
    {
        "original_text": "...",
        "summary": "..."
    },
    {
        "original_text": "...",
        "summary": "..."
    }
]

# Convert each raw record into an LLMTestCase: the source document becomes
# the input and its candidate summary the actual_output under evaluation.
test_cases = []
for data in test_data:
    test_case = LLMTestCase(
        input=data.get("original_text", None),
        # BUG FIX: previously read data.get("input", None) — the records have
        # no "input" key, so actual_output was silently always None. The
        # "summary" field is what the metric must evaluate.
        actual_output=data.get("summary", None)
    )
    test_cases.append(test_case)
最后,批量遍历单元测试用例,使用 DeepEval 与 Pytest 集成,并执行测试文件:
import pytest
from deepeval.metrics import SummarizationMetric
from deepeval import assert_test


# SYNTAX FIX: the decorator's closing parenthesis and the function header
# were fused onto one line as `)deftest_summarization(...)` (also missing
# the space after `def`), which is invalid Python.
@pytest.mark.parametrize(
    "test_case",
    test_cases,
)
def test_summarization(test_case: LLMTestCase):
    """Assert that each parametrized test case passes the summarization metric."""
    metric = SummarizationMetric()
    assert_test(test_case, [metric])
然而,在 LLMs 的准确性测试方面可能需要更加微妙的处理,因为目标标签可能并非非黑即白,对或错。当然,对于像 MMLU 这样的基准测试,目标标签实际上是多项选择题的答案,可以通过精确匹配来轻松量化性能,但在其他情况下我们需要采用更好的方法。例如,考虑期望输出为 'The quick brown fox jumps over the lazy dog.',而模型实际输出为 'A quick brown fox leaps over a lazy dog.' 的情况:两者语义等价,但精确匹配会将其判定为错误。
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

# The judge compares these two fields of the test case.
_correctness_params = [
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.EXPECTED_OUTPUT,
]

# G-Eval "Correctness": an LLM judge scores the actual output against the
# expected output; strict_mode makes the verdict a binary pass/fail.
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine if the actual output is correct with regard to the expected output.",
    evaluation_params=_correctness_params,
    strict_mode=True,
)

# A short QA example whose answer is right in substance but not an exact
# string match against the expected output.
test_case = LLMTestCase(
    input="The dog chased the cat up the tree. Who went up the tree?",
    actual_output="Cat",
    expected_output="The cat",
)

correctness_metric.measure(test_case)
print(correctness_metric.is_successful())
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

# The judge compares these two fields of the test case.
_similarity_params = [
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.EXPECTED_OUTPUT,
]

# G-Eval "Similarity": unlike the strict correctness metric, this judges
# whether the two outputs are semantically equivalent.
similarity_metric = GEval(
    name="Similarity",
    criteria="Determine if the actual output is semantically similar to the expected output.",
    evaluation_params=_similarity_params,
)

# Same QA example: "Cat" vs "The cat" — different strings, same meaning.
test_case = LLMTestCase(
    input="The dog chased the cat up the tree. Who went up the tree?",
    actual_output="Cat",
    expected_output="The cat",
)

similarity_metric.measure(test_case)
print(similarity_metric.is_successful())