diff --git a/tests/output.report.json b/tests/output.report.json new file mode 100644 index 0000000..e023261 --- /dev/null +++ b/tests/output.report.json @@ -0,0 +1,179 @@ +{ + "collectors": [ + { + "nodeid": "", + "outcome": "passed", + "result": [ + { + "nodeid": "tests/test_use_json_report.py", + "type": "Module" + } + ] + }, + { + "nodeid": "tests/test_use_json_report.py", + "outcome": "passed", + "result": [ + { + "lineno": 42, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[0]", + "type": "Function" + }, + { + "lineno": 42, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[1]", + "type": "Function" + }, + { + "lineno": 42, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[2]", + "type": "Function" + }, + { + "lineno": 69, + "nodeid": "tests/test_use_json_report.py::test_pass_rate_of_30_words", + "type": "Function" + } + ] + } + ], + "created": 1723531836.549531, + "duration": 17.205761909484863, + "environment": {}, + "exitcode": 1, + "root": "/Users/wenzhe/dev/log10_eval_example", + "summary": { + "collected": 4, + "failed": 1, + "passed": 3, + "total": 4 + }, + "tests": [ + { + "call": { + "crash": { + "lineno": 0, + "message": "\u001b[0mNumber of words is 35, expected <= 30\nassert False\n\u001b[31mtest_use_json_report.py\u001b[0m:65 in test_summarize_to_30_words() -> with check:\n\u001b[31mtest_use_json_report.py\u001b[0m:66 in test_summarize_to_30_words -> assert num_words_less_than_30, f\"Number of words is {num_words}, expected <= 30\"\n\u001b[31mAssertionError: Number of words is 35, expected <= 30\nassert False\n\u001b[0m\n\n------------------------------------------------------------\nFailed Checks: 1", + "path": "tests/test_use_json_report.py::test_summarize_to_30_words[0]" + }, + "duration": 6.695120791089721, + "longrepr": "\u001b[31mFAILURE: \u001b[0mNumber of words is 35, expected <= 30\nassert False\n\u001b[31mtest_use_json_report.py\u001b[0m:65 in test_summarize_to_30_words() -> with check:\n\u001b[31mtest_use_json_report.py\u001b[0m:66 in test_summarize_to_30_words -> assert num_words_less_than_30, f\"Number of words is {num_words}, expected <= 30\"\n\u001b[31mAssertionError: Number of words is 35, expected <= 30\nassert False\n\u001b[0m\n\n------------------------------------------------------------\nFailed Checks: 1", + "outcome": "failed" + }, + "keywords": [ + "test_summarize_to_30_words[0]", + "parametrize", + "pytestmark", + "0", + "test_use_json_report.py", + "tests", + "log10_eval_example", + "" + ], + "lineno": 42, + "metadata": { + "log10": { + "last_completion_id": "https://log10.io/app/test-0/completions/45515561-5fc3-4709-8361-8eadf349e337" + } + }, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[0]", + "outcome": "failed", + "setup": { + "duration": 0.0008069159230217338, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0001188330352306366, + "outcome": "passed" + } + }, + { + "call": { + "duration": 5.655736291082576, + "outcome": "passed" + }, + "keywords": [ + "test_summarize_to_30_words[1]", + "parametrize", + "pytestmark", + "1", + "test_use_json_report.py", + "tests", + "log10_eval_example", + "" + ], + "lineno": 42, + "metadata": { + "log10": { + "last_completion_id": "https://log10.io/app/test-0/completions/29dded7a-2998-405c-a474-684ec0ba1682" + } + }, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[1]", + "outcome": "passed", + "setup": { + "duration": 0.0003689579898491502, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006292080506682396, + "outcome": "passed" + } + }, + { + "call": { + "duration": 4.296570708043873, + "outcome": "passed" + }, + "keywords": [ + "test_summarize_to_30_words[2]", + "parametrize", + "pytestmark", + "2", + "test_use_json_report.py", + "tests", + "log10_eval_example", + "" + ], + "lineno": 42, + "metadata": { + "log10": { + "last_completion_id": "https://log10.io/app/test-0/completions/9d736109-d982-45e5-a013-db7415f66f2e" + } + }, + "nodeid": "tests/test_use_json_report.py::test_summarize_to_30_words[2]", + "outcome": "passed", + "setup": { + "duration": 0.002584167057648301, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004907089751213789, + "outcome": "passed" + } + }, + { + "call": { + "duration": 0.008413875009864569, + "outcome": "passed" + }, + "keywords": [ + "test_pass_rate_of_30_words", + "test_use_json_report.py", + "tests", + "log10_eval_example", + "" + ], + "lineno": 69, + "nodeid": "tests/test_use_json_report.py::test_pass_rate_of_30_words", + "outcome": "passed", + "setup": { + "duration": 0.006148124928586185, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00025579100474715233, + "outcome": "passed" + } + } + ] +} diff --git a/tests/test_use_json_report.py b/tests/test_use_json_report.py new file mode 100644 index 0000000..613820b --- /dev/null +++ b/tests/test_use_json_report.py @@ -0,0 +1,74 @@ +import pytest +from pytest_check import check +import sys +import os +import pandas as pd + +# Append the src directory to sys.path to make its modules available for import +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))) + +import jsonlines +from log10.load import log10_session + +from my_llm import ( + summarize_to_30_words, +) +from my_eval_metrics import cosine_distance, count_words +from report_utils import ( + filter_results_by_test_name, + report_pass_rate, +) + + +@pytest.fixture +def session(): + with log10_session() as session: + assert session.last_completion_id() is None, "No completion ID should be found." + yield session + + +@pytest.fixture +def data(): + filename = "data.jsonl" + data = [] + with jsonlines.open(filename) as reader: + for obj in reader: + data.append((obj["article"], obj["summary"])) + return data + + +# @pytest.mark.repeat(2) +@pytest.mark.parametrize("sample_idx", range(3)) +def test_summarize_to_30_words(data: list, sample_idx: int, results_bag, json_metadata, session): + article, expected_summary = data[sample_idx] + output = summarize_to_30_words(article) + cos_distance = cosine_distance(expected_summary, output) + num_words = count_words(output) + + results_bag.test_name = f"test_summarize_to_30_words_{sample_idx}" + results_bag.article = article + results_bag.expected_summary = expected_summary + results_bag.output = output + results_bag.cos_sim = cos_distance + results_bag.num_words = num_words + results_bag.log10_completion_url = session.last_completion_url() + json_metadata["log10"] = {"last_completion_id": session.last_completion_url()} + + num_words_less_than_30 = num_words <= 30 + results_bag.num_words_less_than_30 = num_words_less_than_30 + + cos_distance_less_than_02 = cos_distance < 0.2 + results_bag.cos_distance_less_than_02 = cos_distance_less_than_02 + + assert num_words_less_than_30, f"Number of words is {num_words}, expected <= 30" + assert cos_distance_less_than_02, f"Cosine distance is {cos_distance}, expected < 0.2" + + +def test_pass_rate_of_30_words(module_results_df: pd.DataFrame): + #save module_results_df to csv + module_results_df.to_csv("module_results_df_080724.csv", index=False) + df = filter_results_by_test_name(module_results_df, "test_summarize_to_30_words") + + pass_rate, pass_rate_report_str = report_pass_rate(df) + + assert pass_rate > 0.66 \ No newline at end of file