"""Tests for problem-category helpers, reward evaluation, and SMILES extraction."""

from collections.abc import Collection

import pytest
from datasets import Dataset
from pydantic import JsonValue

from ether0.data import (
    SMILES_PATTERN,
    get_problem_categories_from_datasets,
    get_problem_category,
)
from ether0.models import RewardFunctionInfo
from ether0.rewards import EVAL_FUNCTIONS


def test_get_problem_categories_from_datasets(ether0_benchmark_test: Dataset) -> None:
    """Check the fixture dataset yields exactly the expected problem categories."""
    assert get_problem_categories_from_datasets(ether0_benchmark_test) == {
        "functional-group",
        "molecule-completion",
        "molecule-formula",
        "molecule-name",
        "oracle-solubility",
        "property-cat-eve",
        "property-cat-safety",
        "property-cat-smell",
        "property-regression-adme",
        "property-regression-ld50",
        "property-regression-pka",
        "reaction-prediction",
        "retro-synthesis",
        "simple-formula",
    }


# Categories that test_evals skips because the row's 'ideal' cannot be scored.
UNVERIFIABLE_PROBLEM_CATEGORY_PREFIXES_TO_EXCLUDE: Collection[str] = {
    "oracle-solubility",  # 'ideal' is not actually an answer
    "retro-synthesis",  # 'ideal' is not actually an answer
}


def test_evals(ether0_benchmark_test: Dataset) -> None:
    """Check each verifiable row's reward function scores its reference answer as 1.0.

    For "property*" categories the answer info is scored against itself; for all
    other categories the row's 'ideal' value is scored against the answer info.
    Failing rows are collected so a single run reports all of them at once.
    """
    failures = []
    for row in ether0_benchmark_test:
        reward_info = RewardFunctionInfo.model_validate(row["solution"])
        # Positional unpacking relies on the model's field order in model_dump()
        fxn_name, answer_info, problem_type = tuple(reward_info.model_dump().values())
        problem_category = get_problem_category(problem_type)
        if (
            problem_category in UNVERIFIABLE_PROBLEM_CATEGORY_PREFIXES_TO_EXCLUDE
            # Molecule completion had no 'ideal's when this test was made
            or problem_category == "molecule-completion"
        ):
            continue
        # Passed to the eval function and reported alongside any failure
        metadata: dict[str, JsonValue] = {}
        try:
            if problem_category.startswith("property"):
                yhat = answer_info
            else:
                assert row["ideal"]
                yhat = row["ideal"]
            assert (
                EVAL_FUNCTIONS[fxn_name](yhat=yhat, y=answer_info, metadata=metadata)
                == 1.0
            )
        except AssertionError:
            failures.append((problem_category, row["id"], metadata))
    assert not failures, (
        f"{len(failures)} row(s) failed evaluation as (category, id, metadata):"
        f" {failures}"
    )


TEST_REASONING_TEXT = (
    "Let's analyze the given molecules and try to predict their LD50 values. LD50"
    " refers to the lethal dose at which 50% of the test organisms die. A lower LD50"
    " means higher toxicity, and a higher LD50 indicates lower toxicity. We need to"
    " identify structural features that relate to toxicity.\n\nThe question leaves open"
    " the possibility that none of the compounds have an LD50 of 320 mg/kg. Let's"
    " consider each molecule individually:\n\n1."
    " ClC1=C(C=CC(=C1)Cl)C1(OCC(O1)COC1=CC=C(C=C1)N1CCN(CC1)C(C)=O)CN1C=NC=C1: This"
    " molecule appears to be quite complex. It has a dichloro-substituted aromatic"
    " ring, an ether linkage, a morpholine ring, a piperazine ring, and an imidazole"
    " ring. The presence of two chlorine atoms on the phenyl ring could suggest some"
    " interaction with biological targets. The molecule also has a morpholine and"
    " piperazine moiety which could contribute to binding with receptors or enzymes."
    " The presence of an amide group might indicate some polarity, but the overall"
    " structure looks relatively lipophilic (nonpolar) given the aromatic rings and"
    " alkyl chains.\n\n2."
    " ClC1=C(C=CC(=C1)Cl)[C@]1(OC[C@@H](O1)COC1=CC=C(C=C1)N1CCN(CC1)C1=CC=C(C=C1)N1C(N(N=C1)[C@H](CC)C)=O)CN1N=CN=C1:"  # noqa: E501
    " This is a very complex molecule, with multiple rings, stereocenters, and"
    " heteroatoms. It's a distinct structure and appears to be larger than the first"
    " molecule. We can see a furan ring, a pyrazole ring, an amide group, and other"
    " major differences. This change in the rings and other functional groups is likely"
    " to significantly change the molecular properties compared to the first"
    " molecule.\n\n3."
    " [2H]C(C(=O)N1CCN(CC1)C1=CC=C(C=C1)OCC1O[C@@](OC1)(CN1C=NC=C1)C1=C(C=C(C=C1)Cl)Cl)([2H])[2H]:"  # noqa: E501
    " This molecule, labeled with deuterium, has multiple rings including a piperazine,"
    " furan, a substituted imidazole, and a dichlorinated phenyl ring. It also includes"
    " an ester group which is sometimes associated with higher toxicity compared to"
    " simple ethers.\n\nThinking about general principles of toxicity, lipophilicity"
    " (fat solubility) is often related to higher toxicity. A molecule with a marked"
    " lipophilic character can often accumulate in fatty tissues and interact with the"
    " cell membrane, affect cellular transport or receptor activity. This could lead to"
    " higher toxicity by interfering with normal cellular function. Similarly, the"
    " presence of chlorine atoms can sometimes contribute to toxicity due to possible"
    " metabolic activation to reactive intermediates. However, the position and nature"
    " of other substituents and functional groups can influence how chlorine"
    " substitutions modulate toxicity. For example, some chlorinated compounds are"
    " relatively non-toxic.\n\nConsidering the size and complexity of the molecules, we"
    " should think about their potential metabolic pathways. Large molecules can be"
    " metabolized through various pathways, potentially leading to reactive"
    " intermediates that interact with biological molecules. Metabolites of these"
    " compounds might be more or less toxic than the initial molecules, and the"
    " metabolic pathways themselves might be quite different. Perhaps one of the"
    " metabolites could be the reason for an LD50 of 320 mg/kg. Alternatively, a"
    " compound might be relatively non-toxic in itself, but its presence can alter"
    " enzyme activity or other metabolic processes and indirectly lead to cell"
    " damage.\n\nComparing the three molecules. Molecules 1 and 2 share some structural"
    " features like the dichloro-substituted aromatic ring and the presence of a"
    " morpholine ring system. However, they also have distinct differences in the"
    " connectivity and presence of additional rings, including likely some more polar"
    " and/or sterically bulky substituents. Molecule 3 has different ring systems and"
    " the addition of both a deuterated methyl group and an ester group which adds"
    " polar character and can often activate adjacent portions of the molecule by"
    " metabolic oxygenation.\n\nLet's think about bioreactivity beyond simple chemical"
    " interactions. Structures can influence how a molecule interacts with biological"
    " receptors or enzymes. The size and shape of these molecules and the nature of the"
    " functional groups can determine the extent of the molecule's binding interactions"
    " with biomolecules. Some conformationally adaptable structures might bind strongly"
    " to targets and interfere with crucial pathways, which can lead to toxicity."
    " Therefore, weaknesses in essential molecular machinery could have similar"
    " negative effects if bound by those biomolecules.\n\nIf one of these molecules has"
    " an LD50 of 320 mg/kg, it suggests moderate toxicity. It could be that one of the"
    " molecules doesn't have the necessary structural features to interact strongly"
    " with critical biological targets for high toxicity, and/or it might be"
    " metabolized to relatively non-toxic products, such as carbon dioxide and water."
    " Thus, while the molecules share some features with other potentially bioactive"
    " molecules, it could be that they themselves are not exceptionally potent."
)
NO_SMILES_TEXT = "This text does not contain any SMILES"


@pytest.mark.parametrize(
    ("text", "expected_answer"),
    [
        # Explicit ids keep the multi-kilobyte text out of the generated test names
        pytest.param(
            TEST_REASONING_TEXT,
            [
                "ClC1=C(C=CC(=C1)Cl)C1(OCC(O1)COC1=CC=C(C=C1)N1CCN(CC1)C(C)=O)CN1C=NC=C1",
                "ClC1=C(C=CC(=C1)Cl)[C@]1(OC[C@@H](O1)COC1=CC=C(C=C1)N1CCN(CC1)C1=CC=C(C=C1)N1C(N(N=C1)[C@H](CC)C)=O)CN1N=CN=C1",
                "[2H]C(C(=O)N1CCN(CC1)C1=CC=C(C=C1)OCC1O[C@@](OC1)(CN1C=NC=C1)C1=C(C=C(C=C1)Cl)Cl)([2H])[2H]",
            ],
            id="reasoning-text-with-three-smiles",
        ),
        pytest.param(
            NO_SMILES_TEXT,
            [],
            id="text-without-smiles",
        ),
    ],
)
def test_extract_smiles_from_text(text: str, expected_answer: list[str]) -> None:
    """Check SMILES_PATTERN.findall returns the expected matches, ignoring order."""
    assert sorted(SMILES_PATTERN.findall(text)) == sorted(expected_answer)