# %% Imports and JSON / JSONL helpers
import json


def read_json(file_path):
    """Load one JSON document from a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not a single valid JSON document.
            This includes JSON Lines files; use ``read_jsonl`` for those.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def write_json(file_path, data):
    """Serialize ``data`` to ``file_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``) and the
    output is pretty-printed with a 4-space indent. An existing file at
    ``file_path`` is overwritten.

    Args:
        file_path: Destination path.
        data: Any JSON-serializable object.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains an object ``json`` cannot serialize.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def read_jsonl(file_path):
    """Load a JSON Lines file: one JSON document per line.

    Blank and whitespace-only lines (such as a trailing newline at the end of
    the file) are skipped instead of being passed to ``json.loads``, which
    would reject them.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]


# %% Load the dataset
# The collection files are JSON Lines, so they must go through read_jsonl();
# read_json() would fail on them because json.load() accepts only a single
# document per file.
jsonl_file_path = '/code/Data/Symbolic_Collection_scale_0.1.jsonl'

# A Jupyter kernel runs cells with __name__ == "__main__", so this executes in
# the notebook as before. It is skipped only if the code is exported to a
# module and imported, which keeps the helpers above importable without the
# data file being present.
if __name__ == "__main__":
    data = read_jsonl(jsonl_file_path)
It is a conjugate base of a pelargonidin 3-O-rutinoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=C(OC4=CC(=O)C=C(C4=C3)O)C5=CC=C(C=C5)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29326',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a nucleoside 3',5'-cyclic phosphate that is 3',5'-cyclic AMP in which the exocyclic amino group on the purine fragment is replaced by hydrogen It is a 3',5'-cyclic purine nucleotide and a nucleoside 3',5'-cyclic phosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=CN=CN=C43)O)OP(=O)(O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2987',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a member of the class of pyridines that is pyridine which is substituted at positions 2 and 5 by o-(tetrazol-5-yl)phenyl and (3,5-dibutyl-1,2,4-triazol-1-yl)methyl groups, respectively. It is a nonpeptide antagonist of angiotensin II, type 1 (AT1) receptors, used for the treatment of hypertension. It has a role as an angiotensin receptor antagonist and an antihypertensive agent. 
It is a member of tetrazoles, a member of pyridines, a member of triazoles and a member of benzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC1=NN(C(=N1)CCCC)CC2=CN=C(C=C2)C3=CC=CC=C3C4=NNN=N4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22774',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an adenosine 5'-phosphate derivative that has the structure of coenzyme A dephosphorylated at C-3' and with a 5-triphospho-alpha-D-ribosyl substituent at C-2'. It derives from a coenzyme A. It is a conjugate acid of a 2'-(5-triphosphoribosyl)-3'-dephospho-CoA(6-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O[C@@H]4[C@@H]([C@@H]([C@H](O4)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)O)[C@H](C(=O)NCCC(=O)NCCS)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21907',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a tryptamine derivative in which a hydroxy group attached to the same carbon as the primary amino group (the S-enantiomer).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=CN2)C[C@@H](N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11447',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a chiral mycolic acid analogue comprising 3-hydroxypropanoic acid having a tetracosanyl group at position 2 and a further long-chain alkyl group containing cyclopropyl and methoxy functions attached at position 3.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 
'CCCCCCCCCCCCCCCCCCCCCCCC[C@H]([C@@H](CCCCCCCCCCCCCCCCC[C@@H]1C[C@@H]1CCCCCCCCCCCCCCCC[C@H]([C@H](C)CCCCCCCCCCCCCCCCCC)OC)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17779',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organooxygen compound and an organonitrogen compound. It has a role as a metabolite. It derives from a delta-amino acid.\\nThe corresponding SMILES representation is:\\nCOC(=O)[C@]1(CO[C@]2([C@@H]1CCNC2)C=C)C3=CC4=CC=CC=C4N3\\nThe natural language question is: The molecule is a 7alpha-hydroxy steroid, a 24-hydroxy steroid and a 3-oxo-5beta-steroid. It has a role as a bile acid metabolite. It derives from a hydride of a 5beta-cholestane.\\nThe corresponding SMILES representation is:\\nC[C@H](CCC(C(C)C)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@@H](C[C@H]4[C@@]3(CCC(=O)C4)C)O)C\\nThe natural language question is: The molecule is an omega-hydroxy fatty acid ascaroside that is oscr#30 in which the pro-R hydrogen beta to the carboxy group is replaced by a hydroxy group. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is an omega-hydroxy fatty acid ascaroside, a 3-hydroxy carboxylic acid and a monocarboxylic acid. It derives from an oscr#30 and a (3R)-3,17-dihydroxymargaric acid. It is a conjugate acid of a bhos#30(1-).\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCCCCC[C@H](CC(=O)O)O)O)O\\nThe natural language question is: The molecule is a hydroxydocosahexaenoic acid that consists of (4Z,7Z,10Z,14E,16Z,19Z)-docosahexaenoic acid bearing an additional 13-hydroxy substituent. It has a role as a human xenobiotic metabolite and a mouse metabolite. It is a hydroxydocosahexaenoic acid and a secondary allylic alcohol. 
It is a conjugate acid of a (4Z,7Z,10Z,14E,16Z,19Z)-13-hydroxydocosahexaenoate.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C(C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a trihydroxyflavone that is flavone substituted by hydroxy groups at positions 5, 7 and 4', a 2-hydroxy-3-methylbut-3-en-1-yl group at position 3' and a prenyl group at position 5'. Isolated from Epimedium sagittatum, it exhibits inhibitory activity against platelet aggregation. It has a role as a metabolite and a platelet aggregation inhibitor. It is a trihydroxyflavone and a secondary alcohol.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=CCC1=C(C(=CC(=C1)C2=CC(=O)C3=C(C=C(C=C3O2)O)O)CC(C(=C)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11639',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a norlignan that is a derivative of agatharesinol in which the aromatic ring B has an additional hydroxy substituent ortho to the one present in the parent compound. It has a role as a metabolite. It derives from an agatharesinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C/[C@@H](C2=CC(=C(C=C2)O)O)[C@@H](CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16215',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an N-acyl-hexosamine that is the methyl alpha-glycoside of the terminal moiety, and presumed antigenic determinant, of the O-specific polysaccharide of Vibrio cholerae O:1, serotype Ogawa. It has a role as an epitope. It is a N-acyl-hexosamine and a methyl mannoside. 
It derives from an alpha-D-mannose.\\nThe corresponding SMILES representation is:\\nC[C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)OC)OC)O)NC(=O)[C@H](CCO)O\\nThe natural language question is: The molecule is a member of the class of glycerophosphoglycerols obtained by formal condensation of the carboxy group of oleic acid with one of the secondary hydroxy groups of (S,S)-glycero-1-phospho-1'-glycerol It derives from an oleic acid. It is a conjugate acid of a (S,S)-2-oleoylglycero-1-phospho-1'-glycerol(1-). It is an enantiomer of a (R,R)-2-oleoylglycero-1-phospho-1'-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)O[C@@H](CO)COP(=O)(O)OC[C@H](CO)O\\nThe natural language question is: The molecule is an optically active form of prolinium having D-configuration. It has a role as a human metabolite. It is a conjugate acid of a D-proline. It is an enantiomer of a L-prolinium.\\nThe corresponding SMILES representation is:\\nC1C[C@@H]([NH2+]C1)C(=O)O\\nThe natural language question is: The molecule is an alkyl sulfate that is the monomethyl ester of sulfuric acid. It is a one-carbon compound and an alkyl sulfate. It is a conjugate acid of a methyl sulfate(1-).\\nThe corresponding SMILES representation is:\\nCOS(=O)(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an N-glycosyl compound that is a metabolite produced by the bacterium Mycoplasma genitalium. It has a role as a Mycoplasma genitalium metabolite. 
It is a N-glycosyl compound, an aminopyrimidine, an azabicycloalkane and a pyrimidone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]([C@H](O[C@H]1N2C(C(C(=NC2=O)N)N)C3C=C4N3C(=O)N4[C@H]5C[C@@H]([C@H](O5)COP(=O)(O)O)O)COP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23158',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxychrysene that is chrysene in which the hydrogen at position 2 has been replaced by a hydroxy group. It is a metabolite of the polycyclic aromatic hydrocarbon chrysene. It has a role as a xenobiotic metabolite and a xenoestrogen.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C=CC3=C2C=CC4=C3C=CC(=C4)O\\nThe natural language question is: The molecule is a dipeptide obtained by formal condensation of the alpha-carboxy group of L-aspartic acid with the amino group of methyl L-phenylalaninate. Commonly used as an artificial sweetener. It has a role as a sweetening agent, a nutraceutical, a micronutrient, a xenobiotic, an environmental contaminant, an apoptosis inhibitor and an EC 3.1.3.1 (alkaline phosphatase) inhibitor. It is a dipeptide, a carboxylic acid and a methyl ester. It derives from a L-aspartic acid and a methyl L-phenylalaninate.\\nThe corresponding SMILES representation is:\\nCOC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(=O)O)N\\nThe natural language question is: The molecule is a member of the class of chloroethanes that is ethane substituted by chloro groups at positions 1, 1, 2 and 2.\\nThe corresponding SMILES representation is:\\nC(C(Cl)Cl)(Cl)Cl\\nThe natural language question is: The molecule is a carbapenem antibiotic in which the azetidine and pyrroline rings carry 1-hydroxymethyl and pyrazolo[1,2-a][1,2,4]triazolium-6-ylthio substituents respectively. It has a role as an antibacterial drug. 
It is a member of carbapenems, a pyrazolotriazole and an organic sulfide.\\nThe corresponding SMILES representation is:\\nC[C@@H]1[C@@H]2[C@H](C(=O)N2C(=C1SC3CN4C=NC=[N+]4C3)C(=O)[O-])[C@@H](C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the anion resulting from the removal of a proton from the carboxylic acid group of clorazepic acid. It is a conjugate base of a clorazepic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C2=NC(C(=O)NC3=C2C=C(C=C3)Cl)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7397',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a tripeptide formed between L-Hcy, L-Lys and L-Leu in a linear sequence, with the peptide linkage between Hcy and Lys being from the carboxy group of the Hcy to the epsilon-nitrogen of the lysine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C[C@@H](C(=O)O)NC(=O)[C@H](CCCCNC(=O)[C@H](CCS)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14785',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pyridone that is 4-hydroxy-6-oxo-1,6-dihydropyridine-2-carboxylic acid substituted by a 3-carboxy-3-oxopropyl group at position 5. It is an oxo dicarboxylic acid, a pyridone and a monohydroxypyridine. 
It is a conjugate acid of a 5-(3'-carboxy-3'-oxopropyl)-4,6-dihydroxypicolinate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=C(NC(=O)C(=C1O)CCC(=O)C(=O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2444',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 5-[2-(dimethylamino)ethyl]-2-(4-methoxyphenyl)-4-oxo-2,3,4,5-tetrahydro-1,5-benzothiazepin-3-yl acetate in which both stereocentres have R configuration. It is a cGMP-activated K+ channel blocker. It has a role as a potassium channel blocker. It is a conjugate base of an ent-diltiazem(1+). It is an enantiomer of a diltiazem.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)O[C@H]1[C@H](SC2=CC=CC=C2N(C1=O)CCN(C)C)C3=CC=C(C=C3)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28341',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a member of the class of oxazolidines that is 1,3-oxazolidine which is substituted by two methyl groups, dichloroacetyl group and a methyl group at positions 2, 3 and 5, respectively. It is a herbicide safener. It has a role as a herbicide safener. 
It is a member of oxazolidines, a tertiary carboxamide and an organochlorine compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1CN(C(O1)(C)C)C(=O)C(Cl)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18544',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a myo-inositol monophosphate derivative consisting of 1-O-(6-thiohexylphosphono)-D-myo-inositol having a 2-amino-2-deoxy-alpha-D-glucosyl residue at the 6-position. It is a 2-deoxy-alpha-D-glucoside and a myo-inositol monophosphate derivative. It derives from a myo-inositol.\\nThe corresponding SMILES representation is:\\nC(CCCS)CCOP(=O)(O)O[C@@H]1[C@@H]([C@@H]([C@H]([C@@H]([C@H]1O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)N)O)O)O)O\\nThe natural language question is: The molecule is a C-nitro compound that is nitromethane in which all three hydrogens are replaced by chlorines. It is a severe irritant, and can cause immediate, severe inflammation of the eyes, nose and throat, and significant injuries to the upper and lower respiratory tract. Formerly stockpiled as a chemical warfare agent, it has been widely used in the US as a soil fumigant, particularly for strawberry crops. It is not approved for use within the European Union. It has a role as a fumigant insecticide, a nematicide and an antifungal agrochemical. It is a C-nitro compound, a one-carbon compound and an organochlorine compound.\\nThe corresponding SMILES representation is:\\nC([N+](=O)[O-])(Cl)(Cl)Cl\\nThe natural language question is: The molecule is a lignan isolated from the stems of Sinocalamus affinis. It has a role as a plant metabolite. 
It is a lignan, a member of oxolanes, a member of methoxybenzenes, a member of phenols and a primary alcohol.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1O)OC)[C@H]2[C@@H]([C@H]([C@@H](O2)C3=CC(=C(C(=C3)OC)OC)OC)CO)CO\\nThe natural language question is: The molecule is a 1-acyl-sn-glycero-3-phospho-1D-myo-inositol(1-) in which the acyl group is specified as hexadecanoyl (palmitoyl); major species at pH 7.3. It is a 1-acyl-sn-glycero-3-phospho-1D-myo-inositol(1-) and a lysophosphatidylinositol 16:0(1-). It is a conjugate base of a 1-hexadecanoyl-sn-glycero-3-phospho-D-myo-inositol.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OC1[C@@H]([C@H](C([C@H]([C@H]1O)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 1,2-diacyl-sn-glycero-3-phosphoethanolamine in which the 1- and 2-acyl groups are specified as oleoyl and myristoyl respectively. It is a 1,2-diacyl-sn-glycero-3-phosphoethanolamine and a tetradecanoate ester. It derives from an oleic acid. It is a tautomer of a 1-oleoyl-2-myristoyl-sn-glycero-3-phosphoethanolamine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)COP(=O)(O)OCCN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15552',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a chromenone that is 2,3-dihydro-7H-chromen-7-one substituted at position 3 by a 2-hydroxy-4-methoxyphenyl group. A reactive intermediate in the biosynthesis of medicarpin. 
It is a chromenone, a member of phenols and an aromatic ether.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=C(C=C1)[C@@H]2COC3=CC(=O)C=CC3=C2)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28444',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of xanthenes that is 3-imino-3H-xanthen-6-amine in which both nitrogens are carrying methyl substituents. The hydrochloride is the biological dye 'acridine red 3B'. It has a role as a histological dye. It is a xanthene dye, an imine, an aromatic amine and a secondary amino compound. It is a conjugate base of an acridine red 3B(1+).\\nThe corresponding SMILES representation is:\\nCNC1=CC2=C(C=C1)C=C3C=CC(=NC)C=C3O2\\nThe natural language question is: The molecule is a monocarboxylic acid amide obtained by formal condensation of the carboxy group of 3-isobutoxyphenoxyacetic acid with the amino group of 4-methoxyaniline. It has a role as an antifungal agent and a glycerophosphoinositol synthesis inhibitor. It is a monocarboxylic acid amide, an aromatic ether and an aromatic amide. It derives from a phenylacetic acid.\\nThe corresponding SMILES representation is:\\nCC(C)COC1=CC(=CC=C1)OCC(=O)NC2=CC=C(C=C2)OC\\nThe natural language question is: The molecule is an ammonium ion resulting from the protonation of all three amino groups of paromamine. The major species at pH 7.3. It is a conjugate acid of a paromamine.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@@H]([C@H]([C@@H]([C@H]1[NH3+])O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)[NH3+])O)O)[NH3+]\\nThe natural language question is: The molecule is an imidazolidine-2,4-dione that is hydantoin substituted at position 1 by a [(5-nitro-2-furyl)methylene]amino group and at position 3 by a hydroxymethyl group. 
It has a role as an antiinfective agent, an antibacterial drug and a hepatotoxic agent. It is a nitrofuran antibiotic, an imidazolidine-2,4-dione, an organonitrogen heterocyclic antibiotic and a hydrazone. It derives from a semicarbazide.\\nThe corresponding SMILES representation is:\\nC1C(=O)N(C(=O)N1/N=C/C2=CC=C(O2)[N+](=O)[O-])CO\\nNext, you will be given a sample for test.The natural language question is: The molecule is a carbamate ester that is the 1-vinylcyclohexyl ester of carbamic acid. A short-acting sedative-hypnotic, it was formerly used to treat insomnia. It has a role as a sedative. It is a carbamate ester and a terminal acetylenic compound.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C#CC1(CCCCC1)OC(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6865',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a withanolide that is the 17alpha-hydroxy derivative of withanolide D. Isolated from Tubocapsicum anomalum and Withania somnifera, it exhibits cytotoxic activity. It has a role as an antineoplastic agent. It is a delta-lactone, a 20-hydroxy steroid, a 4-hydroxy steroid, an enone, an ergostanoid, a secondary alcohol, a tertiary alcohol, a withanolide, a 17alpha-hydroxy steroid and an epoxy steroid. It derives from a withanolide D.\\nThe corresponding SMILES representation is:\\nCC1=C(C(=O)O[C@H](C1)[C@@](C)([C@]2(CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3C[C@@H]5[C@]6([C@@]4(C(=O)C=C[C@@H]6O)C)O5)C)O)O)C\\nThe natural language question is: The molecule is an omega-hydroxy-long-chain fatty acid that is pentadecanoic acid in which one of the hydrogens of the terminal methyl group has been replaced by a hydroxy group. It is a straight-chain saturated fatty acid and an omega-hydroxy-long-chain fatty acid. It derives from a pentadecanoic acid. 
It is a conjugate acid of a 15-hydroxypentadecanoate.\\nThe corresponding SMILES representation is:\\nC(CCCCCCCO)CCCCCCC(=O)O\\nThe natural language question is: The molecule is an imidazolidine that is N-nitroimidazolidin-2-imine bearing a (6-chloro-3-pyridinyl)methyl substituent at position 1. It has a role as a nicotinic acetylcholine receptor agonist, a neonicotinoid insectide, a xenobiotic, an environmental contaminant and a genotoxin. It is a member of imidazolidines and a monochloropyridine. It derives from a 2-chloropyridine.\\nThe corresponding SMILES representation is:\\nC1CN(C(=N[N+](=O)[O-])N1)CC2=CN=C(C=C2)Cl\\nThe natural language question is: The molecule is a delta-lactam that is L-pipecolic acid carrying additional hydroxy and aminomethyl substituents at position 5 as well as an oxo substituent at position 6. It has a role as a bacterial metabolite. It is a delta-lactam, a N-acyl-L-alpha-amino acid and an amino alcohol. It derives from a L-pipecolic acid.\\nThe corresponding SMILES representation is:\\nC1C[C@@](C(=O)N[C@@H]1C(=O)O)(CN)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an oligopeptide comprising pyroglutamyl, histidyl, tryptophyl, seryl, tyrosyl, 1-benzyl-D-histidyl, leucyl, arginyl, and N-ethylprolinamide residues joined in sequence. It is a synthetic nonapeptide analogue of gonadotropin-releasing hormone, and is used as a subcutaneous hydrogel implant (particularly as the diacetate salt) for the treatment of prostate cancer and for the suppression of gonadal sex hormone production in children with central precocious puberty. 
It has a role as an antineoplastic agent and a gonadotropin releasing hormone agonist.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](CC2=CN(C=N2)CC3=CC=CC=C3)NC(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC5=CNC6=CC=CC=C65)NC(=O)[C@H](CC7=CN=CN7)NC(=O)[C@@H]8CCC(=O)N8'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19185',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an aminotrisaccharide consisting of beta-D-galactopyranosyl, 2-acetamido-beta-D-glucopyranosyl and alpha-L-fucopyranosyl residues joined in sequence by (1->3) and (1->4) glycosidic bonds. It is a member of acetamides and an amino trisaccharide. It derives from a beta-D-Galp-(1->3)-beta-D-GlcpNAc.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O)O)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)NC(=O)C\\nThe natural language question is: The molecule is a dioxo monocarboxylic acid anion obtained by the deprotonation of the carboxylic group of globostellatic acid A. It is a conjugate base of a globostellatic acid A.\\nThe corresponding SMILES representation is:\\nC/C(=C\\\\\\\\C=C\\\\\\\\C(C)(C)O)/C=C/C(=O)/C(=C/1\\\\\\\\C(=O)C[C@@H]2[C@@]1(CC[C@@H]3[C@@]2(CC[C@H]([C@]3(C)C(=O)[O-])OC(=O)C)C)C)/C\\nThe natural language question is: The molecule is a 13-hydroxy-14,15-epoxy-(5Z,8Z,11Z)-icosatrienoate that is the conjugate base of 13-hydroxy-(14R,15S)-epoxy-(5Z,8Z,11Z)-icosatrienoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. 
It is a conjugate base of a 13-hydroxy-(14R,15S)-epoxy-(5Z,8Z,11Z)-icosatrienoic acid.\\nThe corresponding SMILES representation is:\\nCCCCC[C@H]1[C@H](O1)C(/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)[O-])O\\nThe natural language question is: The molecule is a linear amino tetrasaccharide consisting of alpha-fucosyl, beta-galactosyl, beta-glucosaminyl, beta-galactosyl and glucose units connected via sequential (1->2)-, (1->3)-, (1->4)- and (1->4)-linkages. It is an amino pentasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@H]2O[C@@H]3[C@H]([C@@H](O[C@@H]([C@H]3O)CO)O[C@H]4[C@H](O[C@H]([C@@H]([C@H]4O)O)O[C@@H]5[C@H](OC([C@@H]([C@H]5O)O)O)CO)CO)NC(=O)C)CO)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organic heterohexacyclic compound that is a mycotoxic indole alkaloid obtained by prenylation of the 10-hydroxy group of verruculogen. It has a role as a mycotoxin. It is an aromatic ether, a diol, an indole alkaloid, an organic heterohexacyclic compound and an organic peroxide. It derives from a verruculogen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCO[C@H]1C2=C3[C@H](CC(OO[C@@H](N3C4=C2C=CC(=C4)OC)C=C(C)C)(C)C)N5[C@@]1(C(=O)N6CCC[C@H]6C5=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4354',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a pentacyclic triterpenoid that is olean-12-ene in which the hydrogens at the 3beta and 6beta positions have been replaced by hydroxy groups. Found in the seeds of the downy thorn-apple, Datura innoxia. It has a role as a plant metabolite. It is a pentacyclic triterpenoid and a diol. 
It derives from a hydride of an oleanane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CC[C@@]3(C(=CC[C@H]4[C@]3(C[C@H]([C@@H]5[C@@]4(CC[C@@H](C5(C)C)O)C)O)C)[C@@H]1CC(CC2)(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8860',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a hexadecenoyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (E)-hexadec-2-enoyl-CoA; major species at pH 7.3. It has a role as a human metabolite. It is a hexadecenoyl-CoA(4-) and a 2,3-trans-enoyl CoA(4-). It is a conjugate base of an (E)-hexadec-2-enoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23112',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a bis(azo) compound that is naphthalene-2,7-disulfonic acid in which the hydrogens at positions 3, 4, 5, and 6 are replaced by (p-nitrophenyl)azo, amino, hydroxy, and (p-sulfophenyl)azo groups, respectively. The trisodium salt is the histological dye 'naphthalene blue black CS'. It has a role as a fluorochrome and a histological dye. It is a C-nitro compound, an aminonaphthalenesulfonic acid, a bis(azo) compound, a member of naphthols and a member of azobenzenes. 
It is a conjugate acid of a 4-amino-5-hydroxy-3-[(4-nitrophenyl)diazenyl]-6-[(4-sulfonatophenyl)diazenyl]naphthalene-2,7-disulfonate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=CC=C1N=NC2=C(C=C3C=C(C(=C(C3=C2N)O)N=NC4=CC=C(C=C4)S(=O)(=O)O)S(=O)(=O)O)S(=O)(=O)O)[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6005',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is trianion arising from deprotonation of all three carboxylic acid groups of (2S,3S)-2-methylcitric acid. It is a conjugate base of a (2S,3S)-2-methylcitric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](C(=O)[O-])[C@@](CC(=O)[O-])(C(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17614',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a trans-3-enoyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (3E)-octenoyl-CoA; major species at pH 7.3. It is a trans-3-enoyl-CoA(4-) and a monounsaturated fatty acyl-CoA(4-). It is a conjugate base of a (3E)-octenoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC/C=C/CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26017',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a hydrochloride composed of equimolar amounts of mexiletine and hydrogen chloride. It has a role as an anti-arrhythmia drug. 
It contains a mexiletine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=CC=C1)C)OCC(C)[NH3+].[Cl-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8995',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of xanthones that is 9H-xanthene substituted by hydroxy group at positions 1 and 7, an oxo group at position 9 and a geranyloxy group at position 3. It has been isolated from the stems of Cratoxylum cochinchinense. It has a role as a metabolite and a plant metabolite. It is a member of xanthones, a member of phenols and an aromatic ether.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/COC1=CC(=C2C(=C1)OC3=C(C2=O)C=C(C=C3)O)O)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26898',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a nucleotide-sugar oxoanion arising from deprotonation of the diphosphate OH groups of CDP-4-dehydro-3,6-dideoxy-D-glucose; major species at pH 7.3. It is a conjugate base of a CDP-4-dehydro-3,6-dideoxy-D-glucose.\\nThe corresponding SMILES representation is:\\nC[C@@H]1C(=O)C[C@H](C(O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=NC3=O)N)O)O)O\\nThe natural language question is: The molecule is a methyl-branched fatty acid anion that is the conjugate base of 12-methyloctadecanoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a methyl-branched fatty acid anion, a long-chain fatty acid anion and a fatty acid anion 19:0. 
It is a conjugate base of a 12-methyloctadecanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCC(C)CCCCCCCCCCC(=O)[O-]\\nThe natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH groups and protonation of the amino group of validamine 7-phosphate; major species at pH 7.3. It is a conjugate base of a validamine 7-phosphate.\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H]([C@@H]([C@H]([C@H]1[NH3+])O)O)O)COP(=O)([O-])[O-]\\nThe natural language question is: The molecule is a 3-oxo-5alpha- steroid that is zymosterol which has been substituted by a methyl group at C-4, and in which the 3-hydroxy function has been oxidised to an oxo group. It has a role as a human metabolite, a Saccharomyces cerevisiae metabolite and a mouse metabolite. It is a cholestanoid and a 3-oxo-5alpha-steroid. It derives from a zymosterol.\\nThe corresponding SMILES representation is:\\nCC1[C@@H]2CCC3=C([C@]2(CCC1=O)C)CC[C@]4([C@H]3CC[C@@H]4[C@H](C)CCC=C(C)C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is tetraanion of crotonoyl-CoA arising from deprotonation of phosphate and diphosphate functions. It has a role as a human metabolite. It is a conjugate base of a crotonoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28458',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a 1,2-diacyl-3-beta-D-galactosyl-sn-glycerol in which the groups at the 1- and 2-positions are both oleoyl. 
It is a 1,2-diacyl-3-beta-D-galactosyl-sn-glycerol and a 1,2-dioleoyl-3-beta-D-galactosylglycerol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](CO[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2867',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a sulfobenzoic acid in which the sulfonic acid and carboxylic acid groups are in a para-relationship. It derives from a benzoic acid. It is a conjugate acid of a 4-sulfobenzoate(1-) and a 4-sulfonatobenzoate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(=O)O)S(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22669',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a limonoid that is the benzoate ester of nimbocinol. It has been isolated from Azadirachta indica. It has a role as a plant metabolite. It is a cyclic terpene ketone, a member of furans, a limonoid, a tetracyclic triterpenoid and a benzoate ester. It derives from a nimbocinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CC[C@@H]3[C@]4(C=CC(=O)C([C@@H]4C[C@H]([C@]3(C1=CC(=O)[C@H]2C5=COC=C5)C)OC(=O)C6=CC=CC=C6)(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6479',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a monocarboxylic acid anion resulting from the deprotonation of the carboxy group of 4-sulfanylbutanoic acid. The major species at pH7.3. 
It is a conjugate base of a 4-sulfanylbutanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)[O-])CS'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15826',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an oxodocosahexaenoic acid that is (4Z,7Z,10Z,14E,16Z,19Z)-docosahexaenoic acid in which the oxo group is located at position 13. An intermediate of specialised proresolving mediators. It has a role as a human xenobiotic metabolite. It is an oxodocosahexaenoic acid and an enone. It derives from an all-cis-docosa-4,7,10,13,16,19-hexaenoic acid. It is a conjugate acid of a (4Z,7Z,10Z,14E,16Z,19Z)-13-oxodocosahexaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C(=O)C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_399',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a saturated fatty acyl-CoA(4-) obtaned by deprotonation of phosphate and diphosphate functions of tricosanoyl-CoA; major species at pH 7.3. It is a conjugate base of a tricosanoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3798',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is the alpha-amino-acid anion formed by loss of a proton from the carboxy group of 4-hydroxyproline. 
It is a conjugate base of a 4-hydroxyproline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C(CNC1C(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_149',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an enantiomer of pinoresinol having (+)-1S,3aR,4S,6aR-configuration. It has a role as a hypoglycemic agent, a plant metabolite and a phytoestrogen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)[C@@H]2[C@H]3CO[C@@H]([C@H]3CO2)C4=CC(=C(C=C4)O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25067',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a biaryl that is 5',10a'-dihydro-9H,9'H-2,2'-bixanthene-5,9,9'(10aH)-trione substituted by hydroxy groups at positions 1, 1', 5', 8 and 8' and methyl groups at positions 3, 3', 10a and 10a'. Isolated from the cultures of a Hawaiian isolate of the fungus Phoma species, it exhibits antibacterial and antifungal activities. It has a role as an antibacterial agent and a fungal metabolite. It is a biaryl, a polyphenol, a polyketide and a member of xanthones.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=CC2=C(C(=C1C3=C(C4=C(C=C3C)O[C@]5(C(=O)C=CC(=O)C5=C4O)C)O)O)C(=C6C(=O)C=C[C@@H]([C@@]6(O2)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2500',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a member of the class of oxindoles that is a kinase inhibitor used (in the form of its ethylsulfonate salt) for the treatment of idiopathic pulmonary fibrosis and cancer. 
It has a role as an antineoplastic agent, a tyrosine kinase inhibitor, a vascular endothelial growth factor receptor antagonist, a fibroblast growth factor receptor antagonist and an angiogenesis inhibitor. It is an aromatic ester, a methyl ester, a member of oxindoles, an enamine, an aromatic amine, an aromatic amide and a N-alkylpiperazine. It is a conjugate base of a nintedanib(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1CCN(CC1)CC(=O)N(C)C2=CC=C(C=C2)N=C(C3=CC=CC=C3)C4=C(NC5=C4C=CC(=C5)C(=O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18455',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is dianion of S-succinylglutathione having anionic carboxy groups and a protonated primary amino group. It is a conjugate base of a S-succinylglutathione.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)N[C@@H](CSC(=O)CCC(=O)[O-])C(=O)NCC(=O)[O-])[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8967',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 2-hydroxy fatty acid anion that is the conjugate base of 2-hydroxypalmitoleic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a 2-hydroxy fatty acid anion, a long-chain fatty acid anion and a monounsaturated fatty acid anion. It derives from a palmitoleate. 
It is a conjugate base of a 2-hydroxypalmitoleic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C\\\\\\\\CCCCCCC(C(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10045',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a glycosylarabinose consisting of D-mannopyranose and D-arabinofuranose joined in sequence by a (1->5) glycosidic bond. It is a glycosylarabinose and a partially-defined glycan. It derives from a D-mannopyranose and a D-arabinofuranose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H](C(O1)OC[C@@H]2[C@H]([C@@H](C(O2)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2374',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a dihydroxyphenylacetic acid having the two hydroxy substituents at the 2- and 5-positions. It has a role as a human metabolite and a plant metabolite. It is a dihydroxyphenylacetic acid and a member of hydroquinones. It derives from a phenylacetic acid. It is a conjugate acid of a homogentisate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1O)CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1198',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a D-alpha-amino acid that is 6-chlorotryptophan in which the chiral centre has D- (R-) configuration. It is a D-tryptophan derivative, a 6-chlorotryptophan and a D-alpha-amino acid. 
It is a tautomer of a 6-chloro-D-tryptophan zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1Cl)NC=C2C[C@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13033',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a methylhistidine in which the methyl group is located at N-1. It has a role as a human urinary metabolite. It is a non-proteinogenic alpha-amino acid and a methylhistidine. It is a tautomer of a 1-methylhistidine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1C=C(N=C1)CC(C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26042',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a primary aliphatic ammonium ion which is obtained from streptothricin D by protonation of the guanidino and amino groups. It is a guanidinium ion and a primary aliphatic ammonium ion. It is a conjugate acid of a streptothricin D.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@@H]2[C@@H](C(=O)N1)[NH+]=C(N2)N[C@H]3[C@@H]([C@@H]([C@H]([C@H](O3)CO)OC(=O)N)O)NC(=O)C[C@H](CCCNC(=O)C[C@H](CCCNC(=O)C[C@H](CCC[NH3+])[NH3+])[NH3+])[NH3+])O\\nThe natural language question is: The molecule is an oxo dicarboxylic acid that is (4Z)-hept-4-enedioic acid substituted at position 2 by an oxo group. It is an oxo dicarboxylic acid and an olefinic compound. It is a conjugate acid of a (4Z)-2-oxohept-4-enedioate. 
It is a tautomer of a (2Z,4Z)-2-hydroxyhepta-2,4-dienedioic acid.\\nThe corresponding SMILES representation is:\\nC(/C=C\\\\\\\\CC(=O)O)C(=O)C(=O)O\\nThe natural language question is: The molecule is a member of the class of chromenes that is 2H-chromene substituted by geminal methyl groups at position 2, methoxy groups at positions 5 and 7 and a (3R)-3-methoxybutanoyl group at position 6. Isolated from the leaves of Mallotus apelta, it exhibits antineoplastic activity. It has a role as a metabolite and an antineoplastic agent. It is an aromatic ether, a member of chromenes and an aromatic ketone.\\nThe corresponding SMILES representation is:\\nC[C@H](CC(=O)C1=C(C=C2C(=C1OC)C=CC(O2)(C)C)OC)OC\\nThe natural language question is: The molecule is propargyl alcohol in which the methylene hydrogens are substituted by ethyl and 2-chlorovinyl groups. A hypnotic and sedative, it is used for treatment of insomnia in some cases where an intolerance or allergy to more commonly used drugs exists. It has a role as a sedative. It is a tertiary alcohol, an organochlorine compound, an enyne and a terminal acetylenic compound.\\nThe corresponding SMILES representation is:\\nCCC(/C=C/Cl)(C#C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a diterpenoid of the xeniaphyllane type isolated from Sinularia gibberosa and has been shown to exhibit antineoplastic activity. It has a role as a metabolite and an antineoplastic agent. It is a diterpenoid, a ketone and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C/1=C\\\\\\\\CCC(=C)[C@@H]2C[C@]([C@H]2CC1)(C)C(=O)/C=C/C(C)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27503',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a phenanthroline. 
It has a role as an EC 3.4.19.3 (pyroglutamyl-peptidase I) inhibitor and an EC 2.7.1.1 (hexokinase) inhibitor.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7203',\n", + " 'prompt': \"Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a carotenone that consists of beta,beta-carotene bearing four oxo substituents at positions 2, 2', 4 and 4'. It derives from a hydride of a beta-carotene.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=C(C(CC(=O)C1=O)(C)C)/C=C/C(=C/C=C/C(=C/C=C/C=C(/C=C/C=C(/C=C/C2=C(C(=O)C(=O)CC2(C)C)C)\\\\\\\\C)\\\\\\\\C)/C)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15308',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 6-hydroxyheptanoic acid that has R configuration at the chiral centre. It is an enantiomer of a (6S)-6-hydroxyheptanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16585',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a dicarboxylic acid monoanion that is the conjugate base of 2-aminomuconic acid. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It has a role as a human metabolite. It is a conjugate base of a 2-aminomuconic acid. 
It is a conjugate acid of a 2-aminomuconate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(=C/C(=O)O)\\\\\\\\C=C(\\\\\\\\C(=O)[O-])/N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7541',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of the class of thiazolidinones that is 2-sulfanylidene-1,3-thiazolidin-4-one which is substituted at position 5 by a (4-ethylphenyl)methylidene group. It is a cell permeable inhibitor of c-Myc-Max dimerization and exhibits antitumour effects in vivo. It downregulates c-Myc expression and upregulates CDK inhibitors, p21 and p27 resulting in the inhibition of proliferation, induction of apoptosis and cell cycle arrest in G0/G1 phase. It has a role as an apoptosis inducer and an antineoplastic agent. It is a thiazolidinone and an olefinic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=CC=C(C=C1)/C=C/2\\\\\\\\C(=O)NC(=S)S2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26244',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an organic polycyclic compound that is 1,2,6b,7,8,12b-hexahydroperylene-3,9-dione which is substituted at positions 1, 4, 7, and 10 by hydroxy groups (the all-S isomer). It has a role as an antifungal agent and a fungal metabolite. 
It is an organic polycyclic compound, an aromatic ketone, a secondary alcohol and a member of phenols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H]([C@H]2C3=C4[C@@H]([C@H](CC(=O)C4=C(C=C3)O)O)C5=C2C(=C(C=C5)O)C1=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13880',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a carbohydrazide obtained by formal condensation of one of the carboxy groups from butylmalonic acid with the hydrazino group of 1,2-diphenylhydrazine. Used (as its calcium semihydrate) for treatment of rheumatoid arthritis. It has a role as a non-steroidal anti-inflammatory drug and an antipyretic. It is a monocarboxylic acid and a carbohydrazide. It derives from a malonic acid. It is a conjugate acid of a bumadizone(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC(C(=O)N(C1=CC=CC=C1)NC2=CC=CC=C2)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14205',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of (S)-lorglumide, obtained by deprotonation of the carboxy group. It is a conjugate base of a (S)-lorglumide. It is an enantiomer of a (R)-lorglumide(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCN(CCCCC)C(=O)[C@H](CCC(=O)[O-])NC(=O)C1=CC(=C(C=C1)Cl)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22130',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an organic sodium salt which is the disodium salt of eosin b diphenol. It has a role as a fluorescent dye and a histological dye. 
It contains an eosin b(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=O)OC23C4=CC(=C(C(=C4OC5=C(C(=C(C=C35)[N+](=O)[O-])[O-])Br)Br)[O-])[N+](=O)[O-].[Na+].[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2161',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of (S)-2-hydroxybutyric acid, obtained by decarboxylation of the carboxy group. It is a conjugate base of a (S)-2-hydroxybutyric acid.\\nThe corresponding SMILES representation is:\\nCC[C@@H](C(=O)[O-])O\\nThe natural language question is: The molecule is an organic sodium salt that is the disodium salt of N-{4-[2-(2-amino-4-oxo-4,7-dihydro-1H-pyrrolo[2,3-d]pyrimidin-5-yl)ethyl]benzoyl}-L-glutamic acid. Inhibits thymidylate synthase (TS), 421 dihydrofolate reductase (DHFR), and glycinamide ribonucleotide formyltransferase (GARFT). It has a role as an antineoplastic agent, an antimetabolite, an EC 1.5.1.3 (dihydrofolate reductase) inhibitor, an EC 2.1.2.2 (phosphoribosylglycinamide formyltransferase) inhibitor and an EC 2.1.1.45 (thymidylate synthase) inhibitor. It contains a pemetrexed(2-).\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1CCC2=CNC3=C2C(=O)NC(=N3)N)C(=O)N[C@@H](CCC(=O)[O-])C(=O)[O-].[Na+].[Na+]\\nThe natural language question is: The molecule is a cyclic adenyl ribonucleotide that is cAMP (cyclic adenosine 3',5'-phosphate) in which the hydrogen at position 8 has been replaced by a piperidin-1-yl group. A selective cAMP-dependent protein kinase A (PKA) activator and cAMP analogue with high selectivity for site A of PKA type I and for site B of PKA type II. It has a role as an antineoplastic agent and a protein kinase A agonist. 
It is a 3',5'-cyclic purine nucleotide, an adenyl ribonucleotide, a tertiary amino compound and a member of piperidines. It derives from a 3',5'-cyclic AMP.\\nThe corresponding SMILES representation is:\\nC1CCN(CC1)C2=NC3=C(N=CN=C3N2[C@H]4[C@@H]([C@H]5[C@H](O4)COP(=O)(O5)O)O)N\\nThe natural language question is: The molecule is an alkaloid ester obtained by formal condensation of the hydroxy group of cystodytin E with the carboxy group of elaidic acid. It is isolated from the Okinawan marine tunicate Cystodytes dellechiajei and exhibits cytotoxicity against human epidermoid carcinoma KB cells. It has a role as a metabolite and an antineoplastic agent. It is an alkaloid ester, an enamide, an enone, an organic heterotetracyclic compound and a secondary carboxamide. It derives from a tiglic acid, an elaidic acid and a cystodytin E.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C/CCCCCCCC(=O)OC(CNC(=O)/C(=C/C)/C)C1=CC(=O)C2=NC=CC3=C2C1=NC4=CC=CC=C34\\nNext, you will be given a sample for test.The natural language question is: The molecule is a monohydroxyquinoline in which the hydroxy group is positioned at C-8 with a nitro group trans to it at C-5. It has a role as an antimicrobial agent, an antifungal agent, a renal agent and an antiinfective agent. It is a C-nitro compound and a monohydroxyquinoline.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC2=C(C=CC(=C2N=C1)O)[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7709',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of pyrazoles that is 1,3-dimethylpyrazole which is substituted at positions 4 and 5 by 2,4-dichlorobenzoyl and p-tosyloxy groups, respectively. 
It is an obsolete proherbicide (via hydrolysis of the tosylate group to afford the corresponding 5-hydroxypyrazole), that was used to control weeds in rice paddy fields. It has a role as a proherbicide, an EC 1.13.11.27 (4-hydroxyphenylpyruvate dioxygenase) inhibitor, an agrochemical and a carotenoid biosynthesis inhibitor. It is a member of pyrazoles, a tosylate ester, an aromatic ketone and a dichlorobenzene.\\nThe corresponding SMILES representation is:\\nCC1=CC=C(C=C1)S(=O)(=O)OC2=C(C(=NN2C)C)C(=O)C3=C(C=C(C=C3)Cl)Cl\\nThe natural language question is: The molecule is an anthracycline cation that is the conjugate acid of aclacinomycin N, obtained by protonation of the tertiary amino group. It is a conjugate acid of an aclacinomycin N and an aclacinomycin N zwitterion.\\nThe corresponding SMILES representation is:\\nCC[C@]1(C[C@@H](C2=C(C3=C(C=C2[C@H]1C(=O)OC)C(=O)C4=C(C3=O)C(=CC=C4)O)O)O[C@H]5C[C@@H]([C@@H]([C@@H](O5)C)O[C@H]6C[C@@H]([C@@H]([C@@H](O6)C)O[C@H]7CC[C@@H]([C@@H](O7)C)O)O)[NH+](C)C)O\\nThe natural language question is: The molecule is a triterpene glycoside that is lanost-8-ene substituted by a methylidene group at position 24 and a beta-D-glucopyranosyloxy group at position 3. Isolated from the whole plant of Silybum marianum, it exhibits inhibitory activity against chymotrypsin. It has a role as an EC 3.4.21.1 (chymotrypsin) inhibitor and a plant metabolite. It is a beta-D-glucoside, a tetracyclic triterpenoid and a triterpenoid saponin.\\nThe corresponding SMILES representation is:\\nC[C@H](CCC(=C)C(C)C)[C@H]1CC[C@@]2([C@@]1(CCC3=C2CC[C@@H]4[C@@]3(CC[C@@H](C4(C)C)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O)C)C)C\\nThe natural language question is: The molecule is a triterpene consisting of 2,6,10,15,19,23-hexamethyltetracosane having six double bonds at the 2-, 6-, 10-, 14-, 18- and 22-positions with (all-E)-configuration. 
It has a role as a human metabolite, a plant metabolite, a Saccharomyces cerevisiae metabolite and a mouse metabolite.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C=C(/CC/C=C(/CCC=C(C)C)\\\\\\\\C)\\\\\\\\C)/C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a macrolide antibiotic that is 4,10-dimethyl-1-oxa-4-azacyclotridecane-2,5-dione substituted by a 4-(2-hydroxyethoxy)benzyl group at position 3 and a pentyl group at position 13 (the 3S,10R,13S stereoisomer). It is isolated from Penicillium sp.PF1163 and exhibits antifungal activity against the pathogenic fungal strain Candida albicans TIMM1768. It has a role as an antifungal agent and a Penicillium metabolite. It is a macrolide antibiotic, a lactam and an aromatic ether.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@@H]1CC[C@@H](CCCCC(=O)N([C@H](C(=O)O1)CC2=CC=C(C=C2)OCCO)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21269',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a ceramide phosphoinositol compound having a tetracosanoyl group attached to the ceramide nitrogen. It has a role as a Saccharomyces cerevisiae metabolite. It derives from a N-tetracosanoylsphinganine. It is a conjugate acid of an Ins-1-P-Cer(d18:0/24:0)(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)(O)OC1[C@@H]([C@H](C([C@H]([C@H]1O)O)O)O)O)[C@@H](CCCCCCCCCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16459',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a pyrazoloquinoline that is (4aR,8aR)-4,4a,5,6,7,8,8a,9-octahydro-1H-pyrazolo[3,4-g]quinoline substituted by a propyl group at position 5. It acts as a dopamine agonist. 
It has a role as a dopamine agonist.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCN1CCC[C@H]2[C@H]1CC3=C(C2)NN=C3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5247',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a steroid glucuronide anion that is the conjugate base of 17-epiestriol 3-O-(beta-D-glucuronide) arising from deprotonation of the carboxylic acid function; major species at pH 7.3. It is a beta-D-glucosiduronate, a steroid glucosiduronic acid anion and a monocarboxylic acid anion. It is a conjugate base of a 17-epiestriol 3-O-(beta-D-glucuronide).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@H]2O)O)CCC4=C3C=CC(=C4)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)[O-])O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13336',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a dichlorophenol with the chloro substituents at positions 2 and 5. It has a role as a human xenobiotic metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1Cl)O)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23172',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a 7alpha-hydroxy steroid, a 12alpha-hydroxy steroid, a cholestanoid and a 3-oxo-Delta(4) steroid. 
It has a role as a human metabolite and a mouse metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](CC4=CC(=O)CC[C@]34C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27890',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a disaccharide that is D-galactopyranose in which the hydroxy group at position 3 has been converted into the corresponding alpha-D-mannopyranoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O[C@H]2[C@H]([C@H](OC([C@@H]2O)O)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28918',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a C19-gibberellin differing from gibberellin A3 in the presence of a beta-OH at C-9 (gibbane numbering) (C-15 gibberellin numbering). It is a C19-gibberellin, a gibberellin monocarboxylic acid and a lactone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12[C@H](C=C[C@@]3([C@@H]1[C@@H]([C@]45[C@H]3CC[C@](C4)(C(=C)[C@H]5O)O)C(=O)O)OC2=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24141',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a palmitate ester resulting from the formal condensation of the carboxy group of palmitic acid with the hydroxy group of nonan-1-ol. It has a role as a bacterial metabolite. It is a hexadecanoate ester and a wax ester. 
It derives from a nonan-1-ol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17594',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an organic heteropentacyclic guanidine alkaloid isolated from maringe sponge Monanchora sp. It exhibits anti-HIV-1 and anti-HSV-1 activity. It has a role as an anti-HIV-1 agent, an anti-HSV-1 agent and a marine metabolite. It is a member of guanidines, an alkaloid, a carboxylic ester, a spiro compound, an organic heteropentacyclic compound, a primary amino compound and a monocarboxylic acid amide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H]1C=CCC[C@]2(O1)C[C@@H]3CC[C@H]4N3C(=N[C@]5([C@H]4C(=O)OCCCCCCCCCCCCCCCCCC(=O)N(CCCC(=O)N)CCCN)CCC[C@H](O5)C)N2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15146',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is oxytocin in which the hydrogen on the phenolic hydroxy group is substituted by methyl, the amino group on the cysteine residue is substituted by hydrogen, and the sulfur of the cysteine residue is replaced by a methylene group. A synthetic carba-analogue of oxytocin, it is used to control bleeding after giving birth. Like oxytocin, it causes contraction of the uterus. 
It has a role as an oxytocic.\\nThe corresponding SMILES representation is:\\nCC[C@H](C)[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CSCCCC(=O)N[C@H](C(=O)N1)CC2=CC=C(C=C2)OC)C(=O)N3CCC[C@H]3C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N)CC(=O)N)CCC(=O)N\\nThe natural language question is: The molecule is a member of the class of resorcinols that is resorcinol which is substituted by a (2E)-3,7-dimethylocta-2,6-dien-1-yl group at position 2 and by a pentyl group at position 5. It is a natural product found in Cannabis sativa and Helichrysum species. It has a role as an appetite enhancer, a plant metabolite, a cannabinoid receptor agonist, an anti-inflammatory agent, an antibacterial agent, a neuroprotective agent and an antioxidant. It is a phytocannabinoid and a member of resorcinols.\\nThe corresponding SMILES representation is:\\nCCCCCC1=CC(=C(C(=C1)O)C/C=C(\\\\\\\\C)/CCC=C(C)C)O\\nThe natural language question is: The molecule is a lignan that is 3,4-dimethyloxolane substituted by a 2-methoxyphenol group at position 5 and a 2,6-dimethoxyphenol group at 2. It has been isolated from the bark of Machilus robusta. It has a role as a plant metabolite. It is a lignan, a dimethoxybenzene, a member of phenols and a member of oxolanes.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@@H](O[C@@H]1C2=CC(=C(C=C2)O)OC)C3=CC(=C(C(=C3)OC)O)OC)C\\nThe natural language question is: The molecule is a steroid glucosiduronic acid that is 2-hydroxyestrone having a single beta-D-glucuronic acid residue attached at position 3. It is a beta-D-glucosiduronic acid, a 2-hydroxy steroid, a steroid glucosiduronic acid and a 17-oxo steroid. It derives from a 2-hydroxyestrone. 
It is a conjugate acid of a 2-hydroxyestrone 3-O-(beta-D-glucuronide)(1-).\\nThe corresponding SMILES representation is:\\nC[C@]12CC[C@H]3[C@H]([C@@H]1CCC2=O)CCC4=CC(=C(C=C34)O)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organobromine compound that is fluoranthene in which the hydrogens at positions 7 and 10 are substituted by 4-bromophenyl groups, while those at positions 8 and 9 are substituted by 4-octylphenyl groups. It derives from a hydride of a fluoranthene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCC1=CC=C(C=C1)C2=C(C(=C3C4=CC=CC5=C4C(=CC=C5)C3=C2C6=CC=C(C=C6)Br)C7=CC=C(C=C7)Br)C8=CC=C(C=C8)CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26721',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a butadiene with unsaturation at positions 1 and 3. It has a role as a carcinogenic agent and a mutagen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C=CC=C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4702',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a polyunsaturated fatty acid anion that is the conjugate base of 19-HEPE, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a hydroxy fatty acid anion, a polyunsaturated fatty acid anion, a long-chain fatty acid anion, an (omega-1)-hydroxy fatty acid anion and a HEPE(1-). It derives from an all-cis-5,8,11,14,17-icosapentaenoate. 
It is a conjugate base of a 19-HEPE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15336',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a cardenolide glycoside that is digitoxin beta-hydroxylated at C-12. A cardiac glycoside extracted from the foxglove plant, Digitalis lanata, it is used to control ventricular rate in atrial fibrillation and in the management of congestive heart failure with atrial fibrillation, but the margin between toxic and therapeutic doses is small. It has a role as an epitope, an anti-arrhythmia drug, a cardiotonic drug and an EC 3.6.3.9 (Na(+)/K(+)-transporting ATPase) inhibitor. It is a cardenolide glycoside and a steroid saponin. It is a conjugate acid of a digoxin(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@H]([C@H](C[C@@H](O1)O[C@@H]2[C@H](O[C@H](C[C@@H]2O)O[C@@H]3[C@H](O[C@H](C[C@@H]3O)O[C@H]4CC[C@]5([C@@H](C4)CC[C@@H]6[C@@H]5C[C@H]([C@]7([C@@]6(CC[C@@H]7C8=CC(=O)OC8)O)C)O)C)C)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9920',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a member of the class of guanidines that consists of guanidine carrying a methyl substituent at position 1, a cyano group at position 2 and a 2-{[(5-methyl-1H-imidazol-4-yl)methyl]sulfanyl}ethyl group at position 3. It is a H2-receptor antagonist that inhibits the production of acid in stomach. It has a role as a H2-receptor antagonist, a P450 inhibitor, an anti-ulcer drug, an analgesic and an adjuvant. 
It is a member of guanidines, a member of imidazoles, an aliphatic sulfide and a nitrile.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(N=CN1)CSCCNC(=NC)NC#N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11268',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the diphosphate OH groups of N-acetyl-alpha-D-glucosaminyl-1-diphospho-trans,polycis-decaprenol; major species at pH 7.3. It is a conjugate base of a N-acetyl-alpha-D-glucosaminyl-1-diphospho-trans,polycis-decaprenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])OP(=O)([O-])O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14643',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a peptide anion that is the conjugate base of gamma-Glu-Leu, obtained by removal of protons from the two carboxy groups as well as protonation of the amino group; major species at pH 7.3. It is a conjugate base of a gamma-Glu-Leu.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C[C@@H](C(=O)[O-])NC(=O)CC[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7794',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a bile acid glycine conjugate having hyocholic acid as the bile acid component. It has a role as a human metabolite. It derives from a hyocholic acid. 
It is a conjugate acid of a glycohyocholate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@@H]([C@@H]([C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23133',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a fatty acid-taurine conjugate derived from hexadecanoic acid. It has a role as a mouse metabolite. It derives from a hexadecanoic acid. It is a conjugate acid of a N-hexadecanoyltaurine(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)NCCS(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14050',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 14,15-EET in which the epoxy moiety has 14S,15R-configuration. It is a conjugate acid of a (14S,15R)-EET(1-). It is an enantiomer of a (14R,15S)-EET.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@@H]1[C@@H](O1)C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21615',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of furans that is 5-methyl-2-methylenefuran carrying additional oxo and hydroxy groups at positions 3 and 4 respectively. It is a member of furans, an enol, a cyclic ketone and an enone. 
It is a tautomer of a 2-methyl-5-methylenefuran-3,4-dione.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=O)C(=C)O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3288',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an aci-nitro compound resulting from the tautomerisation of the nitro group of 3-nitropropanoic acid. It is a conjugate acid of a 3-(dioxido-lambda(5)-azanylidene)propanoate(2-) and a 3-aci-nitropropanoate. It is a tautomer of a 3-nitropropanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(/C=[N+](/O)\\\\\\\\[O-])C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27251',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a fluorotelomer that is dec-2-enoic acid substituted by fluoro groups at positions 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10 and 10 respectively. It has a role as a xenobiotic and a persistent organic pollutant. It is an alpha,beta-unsaturated monocarboxylic acid and a fluorotelomer.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(=C(/C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)\\\\\\\\F)\\\\\\\\C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24180',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is the xanthosine 5'-phosphate in which the 5'-phosphate is a triphosphate group. It has a role as an Escherichia coli metabolite and a mouse metabolite. It is a purine ribonucleoside 5'-triphosphate and a xanthosine 5'-phosphate. 
It is a conjugate acid of a XTP(3-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)NC(=O)NC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3970',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 2',3'-cyclic purine nucleotide. It has a role as an Escherichia coli metabolite. It is a conjugate acid of a 2',3'-cyclic AMP(1-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@H]4[C@@H]([C@H](O3)CO)OP(=O)(O4)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3025',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a dicarboxylic acid dianion obtained by deprotonation of both carboxy groups of octadecanedioic acid. It is a conjugate base of an octadecanedioic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCCCCC(=O)[O-])CCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17250',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a steroid consisting of cevane having an oxygen bridge between positions 4 and 9 and carrying seven additional hydroxy substituents. It has a role as an insecticide. 
It derives from a hydride of a cevane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1CC[C@H]2[C@@]([C@]3([C@H](C[C@]4([C@@H]5CC[C@H]6[C@]7([C@]5(C[C@]4([C@@H]3CN2C1)O)O[C@@]6([C@@H](CC7)O)O)C)O)O)O)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23393',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a nucleotide-sugar having ADP as the nucleotide fragment and D-ribofuranos-5-yl as the sugar component. It has a role as an Escherichia coli metabolite and a mouse metabolite. It derives from an ADP. It is a conjugate acid of an ADP-D-ribose(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)OP(=O)(O)OC[C@@H]4[C@H]([C@H](C(O4)O)O)O)O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4948',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a cardenolide glycoside consisting of digitoxigenin having an alpha-L-rhamnosyl moiety attached at the O(3)-position. It derives from a digitoxigenin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@H]2CC[C@]3([C@@H](C2)CC[C@@H]4[C@@H]3CC[C@]5([C@@]4(CC[C@@H]5C6=CC(=O)OC6)O)C)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11588',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a monohydroxybenzoic acid consisting of salicylic acid carrying two methyl groups at the 3 and 6 positions. 
It derives from a salicylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=C(C=C1)C)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8397',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a mercaptopropanoic acid that is propanoic acid carrying a sulfanyl group at position 3. It has a role as an algal metabolite. It is a conjugate acid of a 3-mercaptopropionate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CS)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12244',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH groups of isopentenyl phosphate; major species at pH 7.3. It is a conjugate base of an isopentenyl phosphate.\\nThe corresponding SMILES representation is:\\nCC(=C)CCOP(=O)([O-])[O-]\\nThe natural language question is: The molecule is a long-chain fatty acid anion that is the conjugate base of 5-PAHSA, obtained by deprotonation of the carboxy group; major species at pH 7.3. It has a role as an anti-inflammatory agent, a hypoglycemic agent and a human metabolite. It is a conjugate base of a 5-PAHSA.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCCC)CCCC(=O)[O-]\\nThe natural language question is: The molecule is a 1,2-diacyl-sn-glycerol where oleoyl and arachidonoyl are the 1- and 2-acyl groups respectively. It has a role as a mouse metabolite. 
It derives from an oleic acid and an arachidonic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](CO)OC(=O)CCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nThe natural language question is: The molecule is an RNA fragment comprised of two guanosine, two adenosine and three cytidine residues connected by 3'->5' phosphodiester linkages in the sequence G-A-G-A-C-C-C.\\nThe corresponding SMILES representation is:\\nC1=CN(C(=O)N=C1N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O[C@@H]3[C@H](O[C@H]([C@@H]3O)N4C=CC(=NC4=O)N)COP(=O)(O)O[C@@H]5[C@H](O[C@H]([C@@H]5O)N6C=CC(=NC6=O)N)COP(=O)(O)O[C@@H]7[C@H](O[C@H]([C@@H]7O)N8C=NC9=C(N=CN=C98)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C1N=C(NC2=O)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C(N=CN=C21)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C1N=C(NC2=O)N)CO)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a lathyrane diterpenoid isolated from the roots of Euphorbia micractina. It is a lathyrane diterpenoid, an epoxide, a cinnamate ester and a tertiary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C[C@H]1C[C@]2([C@H]([C@H]1OC(=O)/C=C/C3=CC=CC=C3)[C@@H]4[C@](O4)(CC[C@H]5[C@H](C5(C)C)/C=C(/C2=O)\\\\\\\\C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10759',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a branched amino hexasaccharide made up from three galactose residues, two glucosamine residues and one glucose residue (at the reducing end); a constituent of human breast milk. 
It is an amino hexasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC[C@@H]2[C@@H]([C@@H]([C@H]([C@@H](O2)O[C@@H]3[C@H](OC([C@@H]([C@H]3O)O)O)CO)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O[C@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O)O)NC(=O)C)O)CO)O[C@H]6[C@@H]([C@H]([C@H]([C@H](O6)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2752',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a cholanic acid conjugate anion that is the conjugate base of glycohyocholic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It has a role as a human metabolite. It is a cholanic acid conjugate anion and a N-acylglycinate. It is a conjugate base of a glycohyocholic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)NCC(=O)[O-])[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@@H]([C@@H]([C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27552',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an indolyl carbohydrate that is cyclodopa in which the phenolic hydrogen at position 5 has been replaced by a beta-D-glucosyl residue. It has a role as a mouse metabolite, a rat metabolite and a plant metabolite. It is a beta-D-glucoside, a member of phenols, an indolyl carbohydrate and an indolyl carboxylic acid. 
It derives from a leucodopachrome.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H](NC2=CC(=C(C=C21)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11205',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an amino trisaccharide consisting of a 3-deoxy-D-manno-oct-2-ulose residue and two glucosamine residues (one at the reducing end) in a linear sequence, with three phosphate groups attached. Isolated from the lipopolysaccharide obtained from Haemophilus influenzae.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@H]([C@H](O[C@]1(C(=O)O)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC[C@@H]3[C@H]([C@@H]([C@H]([C@H](O3)OP(=O)(O)O)N)O)O)N)O)OP(=O)(O)O)[C@@H](CO)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15357',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the L-enantiomer of N-acetylcitrulline. It is a N-acetylcitrulline and a N-acetyl-L-amino acid. It derives from a L-citrulline. It is a conjugate acid of a N-acetyl-L-citrullinate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H](CCCNC(=O)N)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21371',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an N-acyl-1-O-beta-D-glucosyl-4-hydroxy-15-methylhexadecasphinganine in which the acyl group has 21 carbons and 0 double bonds and is 2-hydroxylated. 
It derives from a 15-methylhexadecaphytosphingosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCC(C(=O)N[C@@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)[C@@H]([C@@H](CCCCCCCCCCC(C)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24278',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 1,1'-lysobisphosphatidic acid in which both acyl groups are specified as oleoyl. It derives from an oleic acid. It is a conjugate acid of a (S,S)-bis-(2-oleoylglycero)-1-phosphate(1-). It is an enantiomer of a (R,R)-bis(2-oleoylglycero)-3-phosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)O[C@H](COP(=O)(OC[C@@H](OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)CO)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16842',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an aldehyde that is acetaldehyde substituted by a 5-hydroxyindol-3-yl group. It has a role as a mouse metabolite and a human metabolite. It is a member of hydroxyindoles and an indoleacetaldehyde.\\nThe corresponding SMILES representation is:\\nC1=CC2=C(C=C1O)C(=CN2)CC=O\\nThe natural language question is: The molecule is a non-proteinogenic amino acid derivative that is methyl alaninate substituted by a tert-butoxycarbonyl group at the N and a 2-(2,6-dichlorophenyl)-4-(phenylsulfanyl)-1,2,3,4,4a,8a-hexahydro-6-quinolyl group at position 3. It is a member of quinolines, a dichlorobenzene, a methyl ester, a carbamate ester, an organic sulfide and a non-proteinogenic amino acid derivative. It contains a phenylsulfanyl group. 
It derives from a tert-butanol.\\nThe corresponding SMILES representation is:\\nCC(C)(C)OC(=O)NC(CC1=CC2C(CC(NC2C=C1)C3=C(C=CC=C3Cl)Cl)SC4=CC=CC=C4)C(=O)OC\\nThe natural language question is: The molecule is an X-rhodamine triethylammonium salt having a carboxy substituent at the 5-position. It has a role as a fluorochrome. It is an organic heteroheptacyclic compound and an organoammonium salt. It contains a 5-carboxy-X-rhodamine and a triethylammonium ion. It derives from a hydride of a 2,3,6,7,12,13,16,17-octahydropyrido[3,2,1-ij]quinolizino[1',9':6,7,8]chromeno[2,3-f]quinolin-18-ium.\\nThe corresponding SMILES representation is:\\nCC[NH+](CC)CC.C1CC2=CC3=C(C4=C2N(C1)CCC4)OC5=C6CCC[N+]7=C6C(=CC5=C3C8=C(C=C(C=C8)C(=O)[O-])C(=O)[O-])CCC7\\nThe natural language question is: The molecule is a D-mannopyranose in which the anomeric centre has beta-configuration. It has a role as an epitope. It is an enantiomer of a beta-L-mannose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@@H]([C@@H](O1)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a pteroate that is the conjugate base of (6S)-5,6,7,8-tetrahydropteroic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. 
It is a conjugate base of a (6S)-5,6,7,8-tetrahydropteroic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H](NC2=C(N1)N=C(NC2=O)N)CNC3=CC=C(C=C3)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27712',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a synthetic pentasaccharide which, apart from the O-methyl group at the reducing end of the molecule, consists of monomeric sugar units which are identical to a sequence of five monomeric sugar units that can be isolated after either chemical or enzymatic cleavage of the polymeric glycosaminoglycans heparin and heparan sulfate. It has a role as an anticoagulant. It is an amino sugar, an oligosaccharide sulfate and a pentasaccharide derivative. It derives from a normethylfondaparinux. It is a conjugate acid of a fondaparinux(10-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CO[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)COS(=O)(=O)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@@H](O2)C(=O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)COS(=O)(=O)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C(=O)O)O[C@@H]5[C@@H]([C@H]([C@@H]([C@H](O5)COS(=O)(=O)O)O)O)NS(=O)(=O)O)O)O)OS(=O)(=O)O)NS(=O)(=O)O)O)OS(=O)(=O)O)O)NS(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13402',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is the zwitterion formed from cis-3-hydroxy-L-proline by proton transfer from the carboxy group to the ring nitrogen. It is the predominant species at physiological pH. 
It is a tautomer of a cis-3-hydroxy-L-proline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[NH2+][C@@H]([C@@H]1O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22476',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an arenesulfonic acid that is benzenesulfonic acid in which the hydrogen at position 4 is replaced by a methyl group. It is a member of toluenes and an arenesulfonic acid. It is a conjugate acid of a toluene-4-sulfonate.\\nThe corresponding SMILES representation is:\\nCC1=CC=C(C=C1)S(=O)(=O)O\\nThe natural language question is: The molecule is the cationic form of a C3 cyanine dye having 1,3-diethyl-5,6-dichloroindoleinine units at each end. It has a role as a fluorochrome. It is a cyanine dye and an indolium ion.\\nThe corresponding SMILES representation is:\\nCCN1C2=CC(=C(C=C2[N+](=C1/C=C/C=C3N(C4=CC(=C(C=C4N3CC)Cl)Cl)CC)CC)Cl)Cl\\nThe natural language question is: The molecule is an N-acetyl-beta-D-glycosaminyl glycopeptide consisting of an N-acetyl-beta-D-glycosaminyl-(1->4)-N-acetylmuramoyl moiety attached to the amino terminus of the pentapeptide L-Ala-gamma-D-Glu-L-Lys-D-Ala-D-Ala via an amide linkage.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@H](CCC(=O)N[C@@H](CCCCN)C(=O)N[C@H](C)C(=O)N[C@H](C)C(=O)O)C(=O)O)NC(=O)[C@@H](C)O[C@H]1[C@@H]([C@H](OC([C@@H]1NC(=O)C)O)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)NC(=O)C\\nThe natural language question is: The molecule is a quercetin O-glycoside that is quercetin attached to a alpha-L-fucopyranosyl moiety at position 3 via a glycosidic linkage. It has a role as a metabolite. It is a monosaccharide derivative, a tetrahydroxyflavone, an alpha-L-fucoside and a quercetin O-glycoside. 
It derives from an alpha-L-fucose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)OC2=C(OC3=CC(=CC(=C3C2=O)O)O)C4=CC(=C(C=C4)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an amino trisaccharide consisting of alpha-L-fucopyranose, 2-O-methyl-beta-D-galactopyranose and 2-acetamido-2-deoxy-beta-D-glucopyranose residues joined in sequence by (1->3) and (1->4) glycosidic bonds. It is an amino sugar, an amino trisaccharide and a member of acetamides.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@H]2[C@H]([C@H](O[C@H]([C@@H]2OC)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O)NC(=O)C)O)CO)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9795',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a pyrimidine ribonucleoside 5'-monophosphate having 5-carboxymethylamino-2-thiouracil as the nucleobase. It has a role as a Mycoplasma genitalium metabolite. It is a glycine derivative and a pyrimidine ribonucleoside 5'-monophosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=C(C(=O)NC(=S)N1[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O)O)O)CNCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3716',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the organosulfonate oxoanion that is the tetraanionic form of the azo dye remazole black-GR. It has a role as a dye. 
It is an organosulfonate oxoanion, a sulfone and a bis(azo) compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1N=NC2=C(C3=C(C(=C(C=C3C=C2S(=O)(=O)[O-])S(=O)(=O)[O-])N=NC4=CC=C(C=C4)S(=O)(=O)CCOS(=O)(=O)[O-])N)O)S(=O)(=O)CCOS(=O)(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18310',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a monounsaturated fatty acid that is nonadecanoic acid with a double bond at position 10. It has a role as a human metabolite. It is a long-chain fatty acid, a monounsaturated fatty acid and a straight-chain fatty acid. It is a conjugate acid of a 10-nonadecenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C/CCCCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22812',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 2,3-trans-enoyl CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (2E,11Z,14Z)-icosatrienoyl-CoA; major species at pH 7.3. It is a conjugate base of a (2E,11Z,14Z)-icosatrienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22545',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a withanolide that is 22,26-epoxyergosta-1,4-diene substituted by an acetyloxy group at position 18 and oxo groups at positions 3 and 26. 
Isolated from a Formosan soft coral Paraminabea acronocephala, it has been found to inhibit the accumulation of the pro-inflammatory iNOS protein. It has a role as a coral metabolite and an EC 1.14.13.39 (nitric oxide synthase) inhibitor. It is a delta-lactone, an acetate ester, an ergostanoid, a withanolide, a steroid ester and a 3-oxo-Delta(1),Delta(4)-steroid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C[C@@H](OC(=O)[C@H]1C)[C@@H](C)[C@H]2CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3CCC5=CC(=O)C=C[C@]45C)COC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27715',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a tripeptide that is L-seryl-L-valyl-L-leucine in which the amino group at the N-terminal is substituted by a tert-butoxycarbonyl group and the carboxy group at the C-terminal is substituted by a {(2S,3E)-5-ethoxy-5-oxo-1-[(3S)-2-oxopyrrolidin-3-yl]pent-3-en-2-yl}amino group. It is a 3C-like protease inhibitor of MERS-CoV and SARS-CoV. It has a role as an anticoronaviral agent and an EC 3.4.22.69 (SARS coronavirus main proteinase) inhibitor. It is a tripeptide, a member of pyrrolidin-2-ones, a tert-butyl ester, an ethyl ester, an enoate ester and a secondary carboxamide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CO)NC(=O)OC(C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18406',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an N-(2-naphthyl)carboxamide obtained by formal condensation of the carboxy group of N-carbobenzyloxy-glycylglycyl-L-arginine with the amino group of 2-naphthylamine. It has a role as a chromogenic compound. 
It is a N-(2-naphthyl)carboxamide and a tripeptide.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)COC(=O)NCC(=O)NCC(=O)N[C@@H](CCCN=C(N)N)C(=O)NC2=CC3=CC=CC=C3C=C2\\nThe natural language question is: The molecule is an anionic C3 cyanine-type compound having indoleinine and tetrahydropyrano[2,3-f]pyrido[3,2,1-ij]quinoline substituents at either end. It has a role as a fluorochrome.\\nThe corresponding SMILES representation is:\\nCC1(C2=C(C=CC(=C2)S(=O)(=O)[O-])[N+](=C1/C=C/C=C/3\\\\\\\\C=C(OC4=C3C=C5CCCN6C5=C4CCC6)C(C)(C)C)CCCS(=O)(=O)[O-])CCCC(=O)O\\nThe natural language question is: The molecule is the S-oxide of methimazole. It is a sulfone and a member of 1,3-dihydroimidazole-2-thiones. It derives from a methimazole.\\nThe corresponding SMILES representation is:\\nCN1C=CNC1=S=O\\nThe natural language question is: The molecule is a 1,3-dichloropropene with a (E)-configuration. It has a role as a fumigant. It is a 1,3-dichloropropene and a chloroalkene. It derives from a hydride of a propene.\\nThe corresponding SMILES representation is:\\nC(/C=C/Cl)Cl\\nNext, you will be given a sample for test.The natural language question is: The molecule is a tetrahydroxyflavanone that is (2S)-flavanone substituted by hydroxy groups at positions 5, 7, 3' and 4 and a geranyl group at position 5'. Isolated from Propolis from Okinawa, Japan, it exhibits radical scavenging activity. It has a role as a metabolite and a radical scavenger. It is a tetrahydroxyflavanone and a member of 4'-hydroxyflavanones. 
It derives from a (2S)-flavanone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=CCC/C(=C/CC1=C(C(=CC(=C1)[C@@H]2CC(=O)C3=C(C=C(C=C3O2)O)O)O)O)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28981',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organophosphate oxoanion that is the dianion of 2'-deoxy-2-hydroxyadenosine 5'-monophosphate arising from deprotonation of both OH groups of the phosphate. It is a conjugate base of a 2-hydroxy-dAMP.\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H](O[C@H]1N2C=NC3=C(NC(=O)N=C32)N)COP(=O)([O-])[O-])O\\nThe natural language question is: The molecule is an organophosphate insecticide, a dialkyl phosphate and a dichlorobenzene. It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor and an agrochemical.\\nThe corresponding SMILES representation is:\\nCOP(=O)(OC)O/C(=C/Cl)/C1=C(C=C(C=C1)Cl)Cl\\nThe natural language question is: The molecule is a member of the class of phenols carrying an isopropyl group at position 2. It derives from a hydride of a cumene.\\nThe corresponding SMILES representation is:\\nCC(C)C1=CC=CC=C1O\\nThe natural language question is: The molecule is a neolignan with formula C20H22O5, originally isolated from Piper kadsura. It has a role as a platelet-activating factor receptor antagonist and a plant metabolite. 
It is a bridged compound, a carbobicyclic compound, a cyclic ketone, an enone, a neolignan and a member of guaiacols.\\nThe corresponding SMILES representation is:\\nC[C@@H]1[C@H]([C@@H]2C(=O)C(=C[C@]1(C2=O)OC)CC=C)C3=CC(=C(C=C3)O)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is an ammonium ion derivative resulting from the protonation of the amino group of 4-{2-[(2-hydroxybenzyl)amino]ethyl}-2,5-dimethoxybenzonitrile. It is a conjugate acid of a 4-{2-[(2-hydroxybenzyl)amino]ethyl}-2,5-dimethoxybenzonitrile.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC(=C(C=C1CC[NH2+]CC2=CC=CC=C2O)OC)C#N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9160',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a penicillin in which the substituent at position 6 of the penam ring is a (2R)-2-amino-2-(cyclohexa-1,4-dien-1-yl)acetamido group. It is a penicillin and a penicillin allergen. It derives from an ampicillin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CCC=CC3)N)C(=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8098',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 7-hydroxyisoflavone that is daidzein substituted by a hydroxy group at position 3'. It has a role as a metabolite, an antineoplastic agent and an EC 1.3.1.22 [3-oxo-5alpha-steroid 4-dehydrogenase (NADP(+))] inhibitor. 
It derives from a daidzein.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=C(C=C1C2=COC3=C(C2=O)C=CC(=C3)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8385',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of 5-[(E)-caffeoyl]shikimic acid. Major structure at pH 7.3 It has a role as a plant metabolite. It is a cyclohexenecarboxylate and a hydroxy monocarboxylic acid anion. It is a conjugate base of a 5-[(E)-caffeoyl]shikimic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H]([C@@H](C=C1C(=O)[O-])O)O)OC(=O)/C=C/C2=CC(=C(C=C2)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4445',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organic anion that is the conjugate base of 8-amino-8-demethylriboflavin, obtained by removal of the imide proton at position 3. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a conjugate base of an 8-amino-8-demethylriboflavin.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C=C1N)N(C3=NC(=NC(=O)C3=N2)[O-])C[C@@H]([C@@H]([C@@H](CO)O)O)O\\nThe natural language question is: The molecule is a tetracyclic triterpenoid isolated from the leaves of Garcia parviflora. It has a role as a plant metabolite. 
It is a tetracyclic triterpenoid and a monocarboxylic acid.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)O)[C@]1(CC[C@H]2[C@]([C@@H]1C=C)(CC[C@@]3([C@@]2(CC[C@@]4([C@H]3CC(CC4)(C)C)C)C)C)C)C\\nThe natural language question is: The molecule is a methyl ester that is methyl phenylacetate substituted at the alpha-position by a cyanomethyl group. It is a nitrile and a methyl ester. It derives from a phenylacetic acid.\\nThe corresponding SMILES representation is:\\nCOC(=O)C(CC#N)C1=CC=CC=C1\\nThe natural language question is: The molecule is a trisaccharide that is alpha-D-galactopyranose in which the hydroxy groups at positions 3 and 4 have been converted into the corresponding beta-D-glucopyranosides. It is a trisaccharide and a beta-D-glucoside.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@H]2[C@H](O[C@@H]([C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)O)CO)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a benzenedicarboxylic acid that is isophthalic acid in which the hydrogen at position 2 is substituted by a hydroxy group. It is a hydroxybenzoic acid, a member of phenols and a benzenedicarboxylic acid. It derives from an isophthalic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1)C(=O)O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11974',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organic cation that is the conjugate acid of methymycin, obtained by protonation of the tertiary amino group; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. 
It is a conjugate acid of a methymycin.\\nThe corresponding SMILES representation is:\\nCC[C@@H]1[C@@](/C=C/C(=O)[C@@H](C[C@@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2[C@@H]([C@H](C[C@H](O2)C)[NH+](C)C)O)C)C)(C)O\\nThe natural language question is: The molecule is a polyacyl alpha,alpha-trehalose derivative that is 2'-sulfo-alpha,alpha-trehalose carrying palmitoyl and stearoyl groups at positions 2 and 3 respectively. It has a role as a bacterial metabolite. It is a polyacyl alpha,alpha-trehalose derivative, a sulfoglycolipid and a trehalose sulfate. It is a conjugate acid of a 2-palmitoyl-3-stearoyl-2'-sulfo-alpha,alpha-trehalose(1-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCC(=O)O[C@H]1[C@@H]([C@H](O[C@@H]([C@@H]1OC(=O)CCCCCCCCCCCCCCC)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)OS(=O)(=O)O)CO)O\\nThe natural language question is: The molecule is a cholesteryl 6-O-acyl-beta-D-galactoside having hexanoyl as the 6-O-acyl group. It is a cholesteryl 6-O-acyl-beta-D-galactoside and a hexanoate ester.\\nThe corresponding SMILES representation is:\\nCCCCCC(=O)OC[C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O[C@H]2CC[C@@]3([C@H]4CC[C@]5([C@H]([C@@H]4CC=C3C2)CC[C@@H]5[C@H](C)CCCC(C)C)C)C)O)O)O\\nThe natural language question is: The molecule is an organic cation obtained by selective protonation at position 1 of the piperidine ring in NAN 190. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a NAN 190.\\nThe corresponding SMILES representation is:\\nCOC1=CC=CC=C1N2CC[NH+](CC2)CCCCN3C(=O)C4=CC=CC=C4C3=O\\nNext, you will be given a sample for test.The natural language question is: The molecule is penicillanic acid carrying a (p-aminophenyl)imino]pentylidene}amino substituent at the 6beta position. It has been used as a hapten in the production of a generic monoclonal antibody for determining penicillin residues in milk. 
It has a role as a hapten.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)N=CCCCC=NC3=CC=C(C=C3)N)C(=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12331',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a dimethoxybenzene that is the methyl ether derivative of veratryl alcohol. It derives from a (3,4-dimethoxyphenyl)methanol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COCC1=CC(=C(C=C1)OC)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9357',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a member of the class of tetrazoles that is 1-methyl-4-phenyltetrazole in which the phenyl group has been substituted at positions 2 and 3 by [1-(p-chlorophenyl)-1H-pyrazol-3-yl]oxy}methyl and methyl groups, respectively. A quinone outside inhibitor, it is a fungicide that can be used to control a broad range of diseases, including Septoria leaf blotch in wheat. It has a role as an antifungal agrochemical and a quinone outside inhibitor. It is a member of tetrazoles, a pyrazole pesticide and a member of monochlorobenzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=CC=C1)N2C(=O)N(N=N2)C)COC3=NN(C=C3)C4=CC=C(C=C4)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12680',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of barbiturates, the structure of which is that of barbituric acid substituted at C-5 by two ethyl groups. Formerly used as a hypnotic (sleeping aid). 
It has a role as a drug allergen.\\nThe corresponding SMILES representation is:\\nCCC1(C(=O)NC(=O)NC1=O)CC\\nThe natural language question is: The molecule is a 3beta-hydroxy steroid resulting from the substitution of the 3beta-hydrogen of tomatidane by a hydroxy group. It is an azaspiro compound, an oxaspiro compound and a 3beta-hydroxy steroid. It derives from a hydride of a tomatidane.\\nThe corresponding SMILES representation is:\\nC[C@H]1CC[C@]2([C@H]([C@H]3[C@@H](O2)C[C@@H]4[C@@]3(CC[C@H]5[C@H]4CC[C@@H]6[C@@]5(CC[C@@H](C6)O)C)C)C)NC1\\nThe natural language question is: The molecule is anionic form of a sulfated menaquinone-type compound arising from deprotonation of the sulfate OH; a metabolite from Mycobacterium tuberculosis lipid extracts.\\nThe corresponding SMILES representation is:\\nCC1=C(C(=O)C2=CC=CC=C2C1=O)C/C=C(\\\\\\\\C)/CCC[C@H](C)CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/OS(=O)(=O)[O-]\\nThe natural language question is: The molecule is an N-substituted diamine that is 1,4-phenylenediamine in which one hydrogen from each amino group is replaced by a phenyl group. It has a role as an antioxidant. It is a secondary amino compound and a N-substituted diamine. It derives from a p-aminodiphenylamine.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)NC2=CC=C(C=C2)NC3=CC=CC=C3\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 4-O-(p-hydroxybenzoyl)ascaroside derived from (8R)-8-hydroxynonanoic acid. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is a 4-O-(p-hydroxybenzoyl)ascaroside, an (omega-1)-hydroxy fatty acid ascaroside and a monocarboxylic acid. 
It derives from an ascr#10 and an (8R)-8-hydroxynonanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCCC(=O)O)O)OC(=O)C2=CC=C(C=C2)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29387',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a phenolate anion obtained by deprotonation of the 2-hydroxy group of norsolorinic acid anthrone. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a conjugate base of a norsolorinic acid anthrone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC(=O)C1=C(C=C2CC3=C(C(=CC(=C3)O)O)C(=O)C2=C1[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9692',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a polyazaalkane that is tetradecane in which the carbons at positions 1, 5, 10 and 14 are replaced by nitrogens. Spermine has broad actions on cellular metabolism. It has a role as an antioxidant, an immunosuppressive agent and a fundamental metabolite. It is a polyazaalkane and a tetramine. It is a conjugate base of a spermine(4+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCNCCCN)CNCCCN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29224',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 3-hydroxycyclohexa-1,5-diene-1-carboxylic acid. It derives from a cyclohexa-1,5-diene-1-carbonyl-CoA. 
It is a conjugate acid of a 3-hydroxycyclohexa-1,5-diene-1-carbonyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)C4=CC(CC=C4)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25850',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organophosphonate oxoanion obtained by deprotonation of the carboxy and phosphonate groups as well as protonation of the amino group of (1R,2S)-1-(S-L-cysteinyl)-2-hydroxypropylphosphonate; major species at pH 7.3. It is an organophosphonate oxoanion and an alpha-amino-acid anion. It is a conjugate base of a (1R,2S)-1-(S-L-cysteinyl)-2-hydroxypropylphosphonate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]([C@H](P(=O)([O-])[O-])SC[C@@H](C(=O)[O-])[NH3+])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13269',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a broad spectrum, third-generation cephalosporin antibiotic with (Z)-2-(4-methyl-1,3-thiazol-5-yl)ethenyl and (2Z)-2-(2-amino-1,3-thiazol-4-yl)-2-(methoxyimino)acetamido groups at positions 3 and 7, respectively, of the cephem skeleton. Generally administered as its orally absorbed pivaloyloxymethyl ester prodrug, it is used for the treatment of mild to moderate infections caused by susceptible strains of microorganisms in acute bacterial exacerbation of chronic bronchitis, community-acquired pneumonia, pharyngitis/tonsillitis, and uncomplicated skin and skin-structure infections. It has a role as an antibacterial drug. 
It is a cephalosporin and a carboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(SC=N1)/C=C\\\\\\\\C2=C(N3[C@@H]([C@@H](C3=O)NC(=O)/C(=N\\\\\\\\OC)/C4=CSC(=N4)N)SC2)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14983',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an HETE having a 9-hydroxy group and (5E)-, (7Z)-, (11Z)- and (14Z)-double bonds. It has a role as a metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CC(/C=C\\\\\\\\C=C\\\\\\\\CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5822',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a retinoid that is all-trans-retinal carrying an oxo substituent at position 4 on the cyclohexenyl ring. It has a role as a mouse metabolite. It is an enal, a retinoid and an enone. It derives from an all-trans-retinal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(CCC1=O)(C)C)/C=C/C(=C/C=C/C(=C/C=O)/C)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9632',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a cyclitol ester that is 1D-myo-inositol bearing a indol-3-acetyl substituent at position 1. It is a member of indoles, a cyclitol ester and an indoleacetic acid ester conjugate. 
It derives from a myo-inositol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=CN2)CC(=O)OC3[C@@H]([C@H](C([C@H]([C@H]3O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29050',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an organic cation obtained by deprotonation of the tertiary amino group of festuclavine; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a festuclavine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C[C@H]2[C@@H](CC3=CNC4=CC=CC2=C34)[NH+](C1)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29578',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a phenolate anion that is the conjugate base of 2,4-diacetylphloroglucinol, obtained by deprotonation of one of the two hydroxy groups at position 1 and 5. Major microspecies at pH 7.3. It has a role as an antifungal agent. It is a conjugate base of a 2,4-diacetylphloroglucinol.\\nThe corresponding SMILES representation is:\\nCC(=O)C1=C(C(=C(C=C1O)O)C(=O)C)[O-]\\nThe natural language question is: The molecule is a member of the class of penicillanic acids that is sulbactam in which one of the exocyclic methyl hydrogens is replaced by a 1,2,3-triazol-1-yl group; used (in the form of its sodium salt) in combination with ceftolozane sulfate for treatment of complicated intra-abdominal infections and complicated urinary tract infections. It has a role as an antimicrobial agent, an antiinfective agent and an EC 3.5.2.6 (beta-lactamase) inhibitor. It is a member of penicillanic acids and a member of triazoles. It derives from a sulbactam. 
It is a conjugate acid of a tazobactam(1-).\\nThe corresponding SMILES representation is:\\nC[C@@]1([C@@H](N2[C@H](S1(=O)=O)CC2=O)C(=O)O)CN3C=CN=N3\\nThe natural language question is: The molecule is a member of the class of hydroxybiphenyls that is catechol in which the hydrogen at position 3 is replaced by a p-chlorophenyl group. It is a diol, a member of catechols, a member of monochlorobenzenes and a member of hydroxybiphenyls. It derives from a biphenyl-2,3-diol and a 4-chlorobiphenyl.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C(=C1)O)O)C2=CC=C(C=C2)Cl\\nThe natural language question is: The molecule is a galactosamine phosphate that is D-galactosamine substituted at position 1 by a monophosphate group. It has a role as an Escherichia coli metabolite. It derives from a D-galactosamine. It is a conjugate acid of a D-galactosamine 6-phosphate(1-).\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@@H]([C@@H]([C@H](C(O1)O)N)O)O)OP(=O)(O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organic cation obtained by protonation of the secondary amino function of (1S,2R)-ephedrine; major species at pH 7.3. It is an enantiomer of a (-)-ephedrinium.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@H](C1=CC=CC=C1)O)[NH2+]C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7122',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a derivative of vindoline lacking the 11-O-methyl and 17-O-acetyl substituents. It is a vinca alkaloid, a methyl ester, a secondary alcohol and a tertiary alcohol. It derives from a vindoline. 
It is a conjugate base of an 11-O-demethyl-17-O-deacetylvindolinium(1+).\\nThe corresponding SMILES representation is:\\nCC[C@@]12C=CCN3[C@@H]1[C@]4(CC3)[C@H]([C@]([C@@H]2O)(C(=O)OC)O)N(C5=C4C=CC(=C5)O)C\\nThe natural language question is: The molecule is ferulic acid in which the ring hydrogen at position 5 is substituted by a hydroxy group. It is a hydroxycinnamic acid and a methoxycinnamic acid. It is a conjugate acid of a 5-hydroxyferulate.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1O)O)/C=C/C(=O)O\\nThe natural language question is: The molecule is a L-alpha-amino acid anion obtained by the deprotonation of the carboxy group of hypusine. It is a conjugate base of a hypusine.\\nThe corresponding SMILES representation is:\\nC(CCNC[C@@H](CCN)O)C[C@@H](C(=O)[O-])N\\nThe natural language question is: The molecule is a cardiolipin in which the phosphatidyl acyl groups at positions 1, 1' and 2 are specified as linoleoyl, while that at position 2' is specified as palmitoyl. It derives from a linoleic acid and a hexadecanoic acid. It is a conjugate acid of a 1,1',2-trilinoleoyl-2'-palmitoyl cardiolipin(2-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)COP(=O)(O)OCC(COP(=O)(O)OC[C@@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is conjugate base of 20-hydroxy-leukotriene B4 arising from deprotonation of the carboxylic acid function. It has a role as a human metabolite. 
It is a conjugate base of a 20-hydroxy-leukotriene B4.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C(CC/C=C\\\\\\\\C[C@H](/C=C/C=C/C=C\\\\\\\\[C@H](CCCC(=O)[O-])O)O)CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16535',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an omega-hydroxy fatty acid that is undecanoic acid in which one of the hydrogens of the terminal methyl group is replaced by a hydroxy group. It is an omega-hydroxy fatty acid, a medium-chain fatty acid and a straight-chain fatty acid. It derives from an undecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCO)CCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22058',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a hydroxamic acid that is N-hydroxy-3-methylbenzamide in which the the hydrogens at positions 2 and 5 have been replaced by benzyl[(4-methoxyphenyl)sulfonyl]amino and (diethylamino)methyl groups, respectively. It is a cell-permeable, potent, selective, and reversible inhibitor of matrix metallopeptidase-9 (MMP-9, EC 3.4.24.35). It has a role as an EC 3.4.24.35 (gelatinase B) inhibitor. It is a hydroxamic acid, a tertiary amino compound, a sulfonamide and an aromatic ether.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN(CC)CC1=CC(=C(C(=C1)C)N(CC2=CC=CC=C2)S(=O)(=O)C3=CC=C(C=C3)OC)C(=O)NO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_121',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a xyloside that is beta-D-xylopyranose in which the anomeric hydroxy hydrogen is replaced by a 4-nitrophenyl group. 
It has a role as a chromogenic compound. It is a xyloside and a C-nitro compound. It derives from a 4-nitrophenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H]([C@H]([C@@H](O1)OC2=CC=C(C=C2)[N+](=O)[O-])O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5807',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a 2-pyranone in which the hydrogens at positions 4 and 6 of 2H-pyran-2-one are replaced by hydroxy and undecyl groups respectively. It is a member of 2-pyranones and a heteroaryl hydroxy compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCC1=CC(=CC(=O)O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13800',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a pyrimidine ribonucleoside 5'-monophosphate that is the 5-carboxymethylaminomethyl-2'-O-methyl derivative of uridine 5'-monophosphate. It is a glycine derivative and a pyrimidine ribonucleoside 5'-monophosphate. It derives from a uridine 5'-monophosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CO[C@@H]1[C@@H]([C@H](O[C@H]1N2C=C(C(=O)NC2=O)CNCC(=O)O)COP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18136',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of the class of chalcones that is chalcone substituted by hydroxy groups at positions 4, 2' and 4', a geranyl group at position 3' and a 6,6-dimethyl-3,6-dihydro-2H-pyran ring fused across positions 5' and 6'. Isolated from the fruits of Mallotus philippensis, it exhibits anti-inflammatory and immunoregulatory activities. 
It has a role as a metabolite, an anti-inflammatory agent, an EC 1.14.13.39 (nitric oxide synthase) inhibitor and a cyclooxygenase 2 inhibitor. It is a member of chalcones, a chromenol and a polyphenol.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=CCC/C(=C/CC1=C(C(=C2C(=C1O)C=CC(O2)(C)C)C(=O)/C=C/C3=CC=C(C=C3)O)O)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2048',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a metal chloride salt with a K(+) counterion. It has a role as a fertilizer. It is a potassium salt and an inorganic chloride.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[Cl-].[K+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27515',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a withanolide saponin that consists of 3-hydroxy-22,26-epoxyergosta-5,24-diene substituted by additonal hydroxy groups at positions 19 and 27, oxo groups at positions 1 and 26 and a beta-D-glucopyranosyl residue at position 3 via a glycodic linkage. It has been isolated from Physalis longifolia. It has a role as a metabolite and a plant metabolite. 
It is a withanolide saponin, a 19-hydroxy steroid, a 27-hydroxy steroid, a delta-lactone, a beta-D-glucoside, a monosaccharide derivative and an ergostanoid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=O)O[C@H](C1)[C@@H](C)[C@H]2CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3CC=C5[C@@]4(C(=O)C[C@@H](C5)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)CO)C)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17486',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a long-chain fatty acyl-CoA(4-) oxanion arising from deprotonation of the phosphate and diphosphate OH groups of 2-methylhexadecanoyl-CoA; major species at pH 7.3 It is a long-chain fatty acyl-CoA(4-) and a saturated fatty acyl-CoA(4-). It is a conjugate base of a 2-methylhexadecanoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCC(C)C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7135',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a tripeptide composed of two L-alanyl and an L-aspartic acid residue joined in sequence. It has a role as a metabolite. It derives from a L-alanine and a L-aspartic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)O)C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5590',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 6-oxo monocarboxylic acid anion that is the conjugate base of 3-isopropenyl-6-oxoheptanoic acid; major species at pH 7.3. It derives from a heptanoate. 
It is a conjugate base of a 3-isopropenyl-6-oxoheptanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=C)C(CCC(=O)C)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22729',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a cyclic hydroxamic acid that is DIMBOA attached to a beta-D-glucopyranosyl residue at position 2 via a glycosidic linkage. It has a role as a plant metabolite. It is a cyclic hydroxamic acid, a benzoxazine and a beta-D-glucoside. It derives from a DIMBOA.\\nThe corresponding SMILES representation is:\\nCOC1=CC2=C(C=C1)N(C(=O)C(O2)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O\\nThe natural language question is: The molecule is a butan-4-olide having a 2-chloroethyl group at the 3-position and two methyl substituents at the 5-position. It is an organochlorine compound and a butan-4-olide.\\nThe corresponding SMILES representation is:\\nCC1(CC(C(=O)O1)CCCl)C\\nThe natural language question is: The molecule is a methyl ketone that consists of propane bearing an oxo group at C2. It has a role as a polar aprotic solvent, a human metabolite and an EC 3.5.1.4 (amidase) inhibitor. It is a methyl ketone, a ketone body, a volatile organic compound and a member of propanones.\\nThe corresponding SMILES representation is:\\nCC(=O)C\\nThe natural language question is: The molecule is a member of the class of resolvins that is (6E,8E,10Z,13Z,15E,19Z)-docosahexaenoic acid carrying three hydroxy substituents at positions 4, 5 and 17 (the 4S,5R,17R-stereoisomer). It has a role as an anti-inflammatory agent, a human xenobiotic metabolite and a mouse metabolite. 
It is a resolvin, a secondary allylic alcohol, a triol and a hydroxy polyunsaturated fatty acid.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C[C@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\[C@H]([C@H](CCC(=O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a triterpenoid saponin isolated from the roots of of the Madagascan plant Albizia gummifera and has been shown to exhibit cytotoxicity against human ovarian cancer cell line. It has a role as an antineoplastic agent and a plant metabolite. It is an enoate ester, a pentacyclic triterpenoid and a triterpenoid saponin. It derives from a hydride of an oleanane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@](C)(CC/C=C(\\\\\\\\C)/C(=O)O[C@@H]2[C@H](O[C@H]([C@@H]([C@H]2O)O)O[C@@](C)(CC/C=C(\\\\\\\\C)/C(=O)O[C@H]3C[C@@]4([C@@H](C[C@@]5(C(=CC[C@H]6[C@]5(CC[C@@H]7[C@@]6(CC[C@@H](C7(C)C)O[C@H]8[C@@H]([C@H]([C@@H]([C@H](O8)CO[C@H]9[C@@H]([C@H]([C@H]([C@H](O9)C)O)O)O[C@H]1[C@@H]([C@H]([C@H](CO1)O)O)O)O)O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)C)C)[C@@H]4CC3(C)C)C)O)C(=O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)C)O[C@H]1[C@@H]([C@H]([C@@H](CO1)O)O)O)O)O)C=C)C)C=C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21942',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 1,2-diacyl-sn-glycerol in which the 1- and 2-acyl groups are specified as palmitoleoyl. It has a role as a mouse metabolite. 
It derives from a palmitoleic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](CO)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25817',\n", + " 'prompt': \"Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a pyrimidine ribonucleoside 5'-tetraphosphate compound having 5'-uridinyl residues at the P(1)- and P(4)-positions. It has a role as a P2Y2 receptor agonist and a mouse metabolite. It is a pyrimidine ribonucleoside 5'-tetraphosphate and a uridine 5'-phosphate.\\nThe corresponding SMILES representation is:\\nC1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=CC(=O)NC4=O)O)O)O)O\\nThe natural language question is: The molecule is an acyl-CoA(4-) oxoanion arising from deprotonation of the phosphate and diphosphate OH groups of 3-(m-hydroxyphenyl)propanoyl-CoA; major species at pH 7.3. It is a conjugate base of a 3-(m-hydroxyphenyl)propanoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CCC4=CC(=CC=C4)O)O\\nThe natural language question is: The molecule is the aromatic diazonium ion that is diazotised 3-aminobenzenesulfonic acid. It has a role as a hapten. It derives from a benzenesulfonate.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC(=C1)S(=O)(=O)[O-])[N+]#N\\nThe natural language question is: The molecule is a hydroxy monocarboxylic acid anion resulting from the deprotonation of the carboxy group of (3R)-3-{[(3R)-3-{[(3R)-3-{[(3R)-3-hydroxybutanoyl]oxy}butanoyl]oxy}butanoyl]oxy}butanoic acid. A tetramer of (3R)-hydroxybutanoate; the major microspecies at pH 7.3. 
It derives from a (R)-3-hydroxybutyrate. It is a conjugate base of a (3R)-3-{[(3R)-3-{[(3R)-3-{[(3R)-3-hydroxybutanoyl]oxy}butanoyl]oxy}butanoyl]oxy}butanoic acid.\\nThe corresponding SMILES representation is:\\nC[C@H](CC(=O)O[C@H](C)CC(=O)O[C@H](C)CC(=O)O[C@H](C)CC(=O)[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an azo dye with a structure consisting of nitrobenzene substituted on the 4-position of the phenyl group with a 4-[N-ethyl-N-(2-hydroxyethyl)]phenylazo group. It has a role as a dye and an allergen. It is a monoazo compound and a member of azobenzenes. It derives from an azobenzene.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCN(CCO)C1=CC=C(C=C1)N=NC2=CC=C(C=C2)[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10879',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a ruthenium coordination entity that acts as a water-soluble carbon monoxide-releasing molecule. It has a role as a nephroprotective agent, an antibacterial agent, an anti-inflammatory agent, an anticoagulant, an EC 1.11.2.2 (myeloperoxidase) inhibitor, a neuroprotective agent and a mitochondrial respiratory-chain inhibitor. It is a metal carbonyl and a ruthenium coordination entity.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C-]#[O+].[C-]#[O+].[C-]#[O+].C(C(=O)O)N.Cl[Ru]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5096',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an organophosphate oxoanion that is the conjugate base of guanylyl-(3'->5')-cytidine, obtained by deprotonation of the phosphate group. 
It is a conjugate base of a guanylyl-(3'->5')-cytidine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CN(C(=O)N=C1N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])O[C@@H]3[C@H](O[C@H]([C@@H]3O)N4C=NC5=C4N=C(NC5=O)N)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29433',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is biphenyl substituted with nitro groups at the 2- and 2'-positions. It is a member of biphenyls and a C-nitro compound.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC=C(C(=C1)C2=CC=CC=C2[N+](=O)[O-])[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27344',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a non-proteinogenic amino acid derivative that is methyl alaninate substituted by a tert-butoxycarbonyl group at the N and a 2-(2,6-dichlorophenyl)-4-(phenylsulfanyl)-1,2,3,4,4a,8a-hexahydro-6-quinolyl group at position 3. It is a member of quinolines, a dichlorobenzene, a methyl ester, a carbamate ester, an organic sulfide and a non-proteinogenic amino acid derivative. It contains a phenylsulfanyl group. It derives from a tert-butanol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C)OC(=O)NC(CC1=CC2C(CC(NC2C=C1)C3=C(C=CC=C3Cl)Cl)SC4=CC=CC=C4)C(=O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28181',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a very long-chain omega-3 fatty acid that is octatriacontanoic acid having six double bonds located at positions 23, 26, 29, 32 and 35 (the 23Z,26Z,29Z,32Z,35Z-isomer). 
It is an omega-3 fatty acid and an octatriacontapentaenoic acid. It is a conjugate acid of a (23Z,26Z,29Z,32Z,35Z)-octatriacontapentaenoate.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCCCCCC(=O)O\\nThe natural language question is: The molecule is a phosphatidic acid in which the phosphatidyl acyl groups are both myristoyl. It is a conjugate acid of a 1,2-dimyrsitoylphosphatidate(2-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC(=O)OCC(COP(=O)(O)O)OC(=O)CCCCCCCCCCCCC\\nThe natural language question is: The molecule is an oxoicosatetraenoic acid that is (5Z,8Z,12E,14Z)-icosatetraenoic acid bearing a single oxo substituent located at position 12. It has a role as a human metabolite. It is an oxoicosatetraenoic acid and an enone. It derives from an icosa-5,8,12,14-tetraenoic acid. It is a conjugate acid of an 11-oxo-ETE(1-).\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C=C\\\\\\\\C(=O)C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O\\nThe natural language question is: The molecule is a steroid glycoside isolated from the roots of Cynanchum auriculatum and has been shown to exhibit cytotoxicity against human tumour cell lines. It has a role as a metabolite and an antineoplastic agent. It is a 17beta-hydroxy steroid, a cinnamate ester, a steroid ester, a deoxy oligosaccharide derivative, a methyl ketone, a steroid saponin and a tertiary alpha-hydroxy ketone. It derives from a hydride of a pregnane.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@@H]2[C@H](O[C@H](C[C@@H]2OC)O[C@H]3CC[C@@]4([C@H]5C[C@H]([C@@]6([C@@](CC[C@@]6([C@@]5(CC=C4C3)O)O)(C(=O)C)O)C)OC(=O)/C=C/C7=CC=CC=C7)C)C)OC)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a diester obtained by the formal condensation of 2,4,4-trimethylpentane-1,3-diol with two molecules of 2-methylpropanoic acid. 
Metabolite observed in cancer metabolism. It has a role as a human metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C(=O)OCC(C)C(C(C)(C)C)OC(=O)C(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29212',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the carbohydrate acid derivative anion formed by proton loss from the free carboxy group of alpha-Kdo1Me-(2->8)-alpha-Kdo-OAll It is a conjugate base of an alpha-Kdo1Me-(2->8)-alpha-Kdo-OAll.\\nThe corresponding SMILES representation is:\\nCOC(=O)[C@]1(C[C@H]([C@H]([C@H](O1)[C@@H](CO)O)O)O)OC[C@H]([C@@H]2[C@@H]([C@@H](C[C@@](O2)(C(=O)[O-])OCC=C)O)O)O\\nThe natural language question is: The molecule is a leukotriene that is the 6-trans,12S-isomer of leukotriene B4. It is a dihydroxy monocarboxylic acid, a leukotriene, a long-chain fatty acid and a hydroxy polyunsaturated fatty acid. It derives from an icosa-6,8,10,14-tetraenoic acid. It is a conjugate acid of a Delta(6)-trans-12-epi-leukotriene B4(1-).\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C[C@@H](/C=C/C=C/C=C/[C@H](CCCC(=O)O)O)O\\nThe natural language question is: The molecule is a cationic sphingoid that is the conjugate acid of 15-methylhexadecasphing-4-enine, obtained by protonation of the amino group; major species at pH 7.3. It is a conjugate acid of a 15-methylhexadecasphing-4-enine.\\nThe corresponding SMILES representation is:\\nCC(C)CCCCCCCCC/C=C/[C@H]([C@H](CO)[NH3+])O\\nThe natural language question is: The molecule is a monomethoxyflavone that is the 3'-O-methyl derivative of myricetin. It has a role as a metabolite. It is a pentahydroxyflavone, a monomethoxyflavone, a member of 3'-methoxyflavones and a 5'-hydroxy-3'-methoxyflavone. It derives from a myricetin. 
It is a conjugate acid of a laricitrin(1-).\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1O)O)C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is dianion of undecaprenyl phosphate arising from deprotonation of the phosphate OH groups; major species at pH 7.3. It is a conjugate base of an undecaprenyl dihydrogen phosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/COP(=O)([O-])[O-])/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21762',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organic heterotetracyclic compound that is 9,9a-dihydro-9,4a-prop[1]enoxanthene-1,4-dione substituted by a hydroxy group at position 7, a methyl group at position 12 and a prenyl group at position 9a. Isolated from the root barks of Ehretia buxifolia, it exhibits antisnake venom activity. It has a role as a metabolite and an antidote. It is an organic heterotetracyclic compound, a cyclic ketone, a cyclic ether and a member of phenols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C[C@H]2C3=C(C=CC(=C3)O)O[C@@]4(C1)[C@@]2(C(=O)C=CC4=O)CC=C(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6059',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a phenol substituted at position 4 by a 2-hydroxyethyl group. It has a role as an anti-arrhythmia drug, an antioxidant, a cardiovascular drug, a protective agent and a fungal metabolite. 
It derives from a 2-phenylethanol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1CCO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24012',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a nucleotide-sugar oxoanion arising from deprotonation of the diphosphate OH groups of GDP-L-galactose; major species at pH 7.3. It is a conjugate base of a GDP-L-galactose.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])OC4[C@H]([C@@H]([C@@H]([C@@H](O4)CO)O)O)O)O)O)N=C(NC2=O)N\\nThe natural language question is: The molecule is an organic anion obtained by removal of one of the methylene protons from barbituric acid. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a conjugate base of a barbituric acid.\\nThe corresponding SMILES representation is:\\nC1=C(NC(=O)NC1=O)[O-]\\nThe natural language question is: The molecule is a disaccharide that is D-mannopyranose in which the hydroxy group at position 4 has been converted into the corresponding beta-D-galactopyranosyl derivative. It derives from a beta-D-mannose and a beta-D-galactose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O[C@@H]2[C@H](O[C@H]([C@H]([C@H]2O)O)O)CO)O)O)O)O\\nThe natural language question is: The molecule is an octadecenoyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (11E)-octadecenoic acid. It derives from a trans-vaccenic acid. 
It is a conjugate acid of an (11E)-octadecenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCCC/C=C/CCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a pyrazolooxadiazepine that is 7-oxo-1,2,4,5-tetrahydro-7H-pyrazolo[1,2-d][1,4,5]oxadiazepin which is substituted at positions 8 and 9 by 2,6-diethyl-4-methylphenyl and pivaloyloxy groups, respectively. A pro-herbicide (by hydrolysis of the pivalate ester to give the corresponding enol), it is used for control of grass weeds in cereal crops. It has a role as a xenobiotic, an environmental contaminant, an agrochemical, an EC 6.4.1.2 (acetyl-CoA carboxylase) inhibitor and a proherbicide. It is a pivalate ester and a pyrazolooxadiazepine. It derives from a pinoxaden acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=CC(=CC(=C1C2=C(N3CCOCCN3C2=O)OC(=O)C(C)(C)C)CC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22055',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an iminium betaine that is 5-methylphenazin-5-ium which is substituted at position 1 by an oxidanidyl group. An antibiotic pigment produced by Pseudomonas aeruginosa. It has a role as an antibacterial agent, a biological pigment, a bacterial metabolite and a virulence factor. It is a member of phenazines and an iminium betaine. 
It is a conjugate base of a pyocyanine(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1C2=C(C(=CC=C2)O)NC3=CC=CC=C31'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28871',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of depsidones that is 3,4-dihydro-2H,7H-chromeno[7,6-b][1,4]benzodioxepine substituted by a chloro group at position 9, a hydroxy group at position 10, methyl groups at positions 2, 2, 5 and 8, a formyl group at position 11 and oxo groups at positions 4 and 7. Isolated from Chaetomium brasiliense it exhibits antimalarial and cytotoxic activities. It has a role as an antimalarial, an antineoplastic agent and a Chaetomium metabolite. It is an aldehyde, a member of depsidones, a member of phenols, an organic heterotetracyclic compound and an organochlorine compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C2C(=O)CC(OC2=CC3=C1OC(=O)C4=C(C(=C(C(=C4O3)C=O)O)Cl)C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15340',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxycalciol that is a synthetic fluorinated and deuterated analogue of vitamin D3 which exhibits vitamin D receptor superagonist and anti-cancer activity. It has a role as a vitamin D receptor agonist and an antineoplastic agent. 
It is a member of D3 vitamins, a hydroxycalciol, a deuterated compound, an organofluorine compound and a tetrol.\\nThe corresponding SMILES representation is:\\n[2H]C([2H])([2H])C(CCC[C@@H](CC#CC(C(F)(F)F)(C(F)(F)F)O)[C@H]1CC[C@@H]\\\\\\\\2[C@@]1(CCC/C2=C\\\\\\\\C=C/3\\\\\\\\C[C@H](C[C@@H](C3=C)O)O)C)(C([2H])([2H])[2H])O\\nThe natural language question is: The molecule is an amino trisaccharide consisting of alpha-L-rhamnose at the reducing end having an alpha-L-rhamnosyl-(1->3)-N-acetyl-beta-D-glucosaminyl moiety attached at the 2-position. It is an amino trisaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O[C@H]3[C@@H]([C@@H]([C@H]([C@@H](O3)C)O)O)O)NC(=O)C)O)O\\nThe natural language question is: The molecule is a galactosamine phosphate that is D-galactosamine substituted at position 1 by a monophosphate group. It derives from a D-galactosamine.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@@H]([C@@H]([C@H](C(O1)OP(=O)(O)O)N)O)O)O\\nThe natural language question is: The molecule is a steroidal acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (24E)-3alpha,7alpha-dihydroxy-5beta-cholest-24-en-26-oic acid. It has a role as a human metabolite. It is a conjugate acid of a (24E)-3alpha,7alpha-dihydroxy-5beta-cholest-24-en-26-oyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nC[C@H](CC/C=C(\\\\\\\\C)/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)[C@H]4CC[C@@H]5[C@@]4(CC[C@H]6[C@H]5[C@@H](C[C@H]7[C@@]6(CC[C@H](C7)O)C)O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a sodium salt of the conjugate of any bile acid with either glycine or taurine. 
It is a cholanoid and an organic sodium salt.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCC(=O)O)C1CCC2C1([C@H](CC3C2[C@@H](C[C@H]4C3(CC[C@H](C4)O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22178',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an organic heteropentacyclic compound that is a mycotoxic indole alkaloid produced by several fungi via a tryptophan-proline diketopiperazine intermediate. It has a role as a mycotoxin. It is an organic heteropentacyclic compound and an indole alkaloid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCN1C2=C(C=CC(=C2)OC)C3=C1[C@@H](N4C(=O)[C@@H]5CCCN5C(=O)[C@@]4([C@H]3O)O)C=C(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22481',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a carboxamidinium ion resulting from the protonation of both of the amidino groups of diminazene. The major species at pH 7.3. It has a role as an antiparasitic agent and a trypanocidal drug. It is a conjugate acid of a diminazene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(=[NH2+])N)NN=NC2=CC=C(C=C2)C(=[NH2+])N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15539',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a naphthyridine in which the nitrogens are at positions 1 and 4. 
It is a mancude organic heterobicyclic parent, an ortho-fused heteroarene and a naphthyridine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)N=CC=N2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14786',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a carbamate ester that is phenylcarbamic acid in which the hydrogen of the hydroxy group has been replaced by a 3-[(ethoxycarbonyl)amino]phenyl group. It is an agrochemical used as a herbicide. It has a role as a xenobiotic, an environmental contaminant, a herbicide and an agrochemical. It derives from a phenylcarbamic acid.\\nThe corresponding SMILES representation is:\\nCCOC(=O)NC1=CC(=CC=C1)OC(=O)NC2=CC=CC=C2\\nThe natural language question is: The molecule is a nucleoside 5'-monophosphate(2-) that results from the removal of two protons from the phosphate group of N(6)-methyl-AMP. It derives from an adenosine 5'-monophosphate(2-). It is a conjugate base of a N(6)-methyl-AMP.\\nThe corresponding SMILES representation is:\\nCNC1=C2C(=NC=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])[O-])O)O\\nThe natural language question is: The molecule is an organoarsonic acid salt and an organic sodium salt. It has a role as an antisyphilitic drug. It contains an arsanilate(1-).\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1N)[As](=O)(O)[O-].[Na+]\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of tolmetin, obtained by deprotonation of the carboxy group. 
It is a conjugate base of a tolmetin.\\nThe corresponding SMILES representation is:\\nCC1=CC=C(C=C1)C(=O)C2=CC=C(N2C)CC(=O)[O-]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside derived from (2E)-9-hydroxynon-2-enoic acid. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is an alpha,beta-unsaturated monocarboxylic acid, a 4-O-(1H-indol-3-ylcarbonyl)ascaroside and an omega-hydroxy fatty acid ascaroside. It derives from a (2E)-9-hydroxynon-2-enoic acid and an oscr#3.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCC/C=C/C(=O)O)O)OC(=O)C2=CNC3=CC=CC=C32'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20817',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a gallate ester obtained by the formal condensation of gallic acid with methanol. It exhibits anti-oxidant, anti-tumor, anti-microbial and anti-inflammatory properties. It has a role as a plant metabolite, an anti-inflammatory agent and an antioxidant.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC(=O)C1=CC(=C(C(=C1)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24227',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a monoalkyl phosphate that is ethyl phosphate in which one of the methyl hydrogens has been replaced by an isothiocyanato group. It is an isothiocyanate and a monoalkyl phosphate. 
It derives from an ethyl dihydrogen phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(COP(=O)(O)O)N=C=S'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7678',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the dianion of Alexa Fluor 488 para-isomer. It has a role as a fluorochrome. It is an organosulfonate oxoanion and a xanthene dye.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C=C1C(=O)O)C2=C3C=CC(=N)C(=C3OC4=C2C=CC(=C4S(=O)(=O)[O-])N)S(=O)(=O)[O-])C(=O)O\\nThe natural language question is: The molecule is a naphthofuran that is 2,3-dihydronaphtho[1,2-b]furan-4,5-dione substitutd by a hydroxy group at position 6 and a methyl group at position 2. It is isolated from Norcardia sp.TC-A0248 and acts as a protein tyrosine phosphatase inhibitor. It has a role as a metabolite, an EC 3.1.3.48 (protein-tyrosine-phosphatase) inhibitor, an antifungal agent and an antimicrobial agent. It is a naphthofuran, a member of phenols and a member of orthoquinones.\\nThe corresponding SMILES representation is:\\nC[C@H]1CC2=C(O1)C3=C(C(=CC=C3)O)C(=O)C2=O\\nThe natural language question is: The molecule is an organic heterobicyclic compound, which is imidazo[4,5-d]azepin-5(6H)-one substituted by a 3,5-dibromo-4-methoxybenzyl group at position 4, a methyl group at position 6 and a methylamino group at position 2. It is an antimitotic alkaloid isolated from the marine sponge Pseudoceratina. It has a role as a metabolite and an antimitotic. 
It is an organobromine compound, an alkaloid, an organic heterobicyclic compound, an aromatic ether, a cyclic ketone, a tertiary amine and a secondary amino compound.\\nThe corresponding SMILES representation is:\\nCNC1=NC2=C(C(=O)N(C=CC2=N1)C)CC3=CC(=C(C(=C3)Br)OC)Br\\nThe natural language question is: The molecule is an azo dye with a structure consisting of acetanilide substituted on the 4-position of the phenyl group with a 6-hydroxy-m-tolylazo group. It has a role as a dye and an allergen. It is a monocarboxylic acid amide and a member of azobenzenes. It derives from an azobenzene.\\nThe corresponding SMILES representation is:\\nCC1=CC(=C(C=C1)O)N=NC2=CC=C(C=C2)NC(=O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydrochloride obtained by combining methoctramine with four molar equivalents of hydrochloric acid. It has a role as a muscarinic antagonist. It contains a methoctramine(4+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC=CC=C1CNCCCCCCNCCCCCCCCNCCCCCCNCC2=CC=CC=C2OC.Cl.Cl.Cl.Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18931',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a long-chain fatty acid ethyl ester resulting from the formal condensation of the carboxy group of (5Z,8Z,11Z,14Z,17Z)-icosapentaenoic acid with the hydroxy group of ethanol. It has a role as an anticholesteremic drug, a marine metabolite, an antipsychotic agent, an antidepressant and a prodrug. It is a long-chain fatty acid ethyl ester and a polyunsaturated fatty ester. 
It derives from an all-cis-5,8,11,14,17-icosapentaenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)OCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16185',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a polyunsaturated fatty acid anion that is the conjugate base of (8S)-hydroperoxy-(14S,15R)-epoxy-(5Z,9E,11Z)-icosatrienoate, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of an (8S)-hydroperoxy-(14S,15R)-epoxy-(5Z,9E,11Z)-icosatrienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@@H]1[C@@H](O1)C/C=C\\\\\\\\C=C\\\\\\\\[C@H](C/C=C\\\\\\\\CCCC(=O)[O-])OO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29148',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the penta-anion resulting from the removal of a proton from each of the carboxylic acid groups of coenzyme F430. The major species at pH 7.3. It has a role as a cofactor. It is a conjugate base of a coenzyme F430.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CC(=O)N[C@@]13C[C@H]4[C@H]([C@](C(=N4)C[C@@H]5[C@H]([C@@H]6CCC(=O)/C(=C/7\\\\\\\\[C@H]([C@@H](/C(=C/C(=N3)[C@H]2CCC(=O)[O-])/[N-]7)CC(=O)[O-])CCC(=O)[O-])/C6=N5)CC(=O)[O-])(C)CC(=O)N)CCC(=O)[O-].[Ni]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7693',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a cinnamate ester obtained by formal condensation of the carboxy group of sinapic acid with the 4-hydroxy group of (-)-quinic acid. It is a cinnamate ester and a quinic acid. 
It derives from a (-)-quinic acid and a trans-sinapic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=CC(=C1O)OC)/C=C/C(=O)OC2[C@@H](CC(C[C@H]2O)(C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_72',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a L-arginine derivative with a N(omega)-methyl substituent. It is a member of guanidines, a non-proteinogenic L-alpha-amino acid and a L-arginine derivative. It is a conjugate acid of a N(omega)-methyl-L-argininate. It is a tautomer of a N(omega)-methyl-L-arginine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN=C(N)NCCC[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_342',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an organic heterotetracyclic compound that is 4'-demethylpodophyllotoxin which is substituted by a hydroxy group at position 10 but which is lacking the hydroxy group at position 9. It is found as a glucoside in the rhizomes of Podophyllum peltatum. It has a role as a metabolite and an antineoplastic agent. It is a furonaphthodioxole, a gamma-lactone, an organic heterotetracyclic compound, a lignan and a member of phenols.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC(=CC(=C1O)OC)[C@H]2[C@@H]3[C@@H](CC4=C(C5=C(C=C24)OCO5)O)COC3=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21115',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of anthocyanin chlorides that has cyanidin 3-O-rutinoside as the cationic counterpart. 
It contains a cyanidin 3-O-rutinoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=CC4=C(C=C(C=C4[O+]=C3C5=CC(=C(C=C5)O)O)O)O)O)O)O)O)O)O.[Cl-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28161',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a sesquiterpenoid that is 1,2,3a,4,5,7,8,9,9a,9b-decahydronaphtho[2,1-b]furan substituted by methyl groups at positions 1, 9 and 9a and hydroxy groups at positions 2 and 7. Isolated from methylene chloride solubles of the Formosan soft coral Nephthea elongata, it exhibits cytotoxicity against selected cancer cells. It has a role as an antineoplastic agent and a coral metabolite. It is a sesquiterpenoid, a cyclic ether, a diol and an organic heterotricyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C[C@@H](C=C2[C@@]1([C@H]3[C@@H]([C@H](O[C@H]3CC2)O)C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27109',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a limonoid that is the 30-acetyl derivative of trichagmalin F. It has been isolated from Trichilia connaroides. It has a role as a plant metabolite. It is a delta-lactone, a bridged compound, a member of furans, a limonoid, an organic heteropentacyclic compound and a methyl ester. 
It derives from a trichagmalin F, a 2-hydroxyisobutyric acid and a tiglic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C=C(\\\\\\\\C)/C(=O)O[C@H]1[C@]2(C[C@@]3([C@]1([C@H](C4=C5[C@H](C(=O)O[C@H]([C@@]5(CC[C@@H]4[C@@]3([C@H]2CC(=O)OC)C)C)C6=COC=C6)OC(=O)C(C)(C)O)OC(=O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28032',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a monocarboxylic acid amide obtained by the formal condensation of cyclohexanecarboxylic acid with the amino group of 5-(naphthalen-1-ylmethyl)-1,3-thiazol-2-amine. It has been shown to exhibit antifungal activity. It has a role as an antifungal agent. It is a member of 1,3-thiazoles, a member of naphthalenes and a monocarboxylic acid amide. It derives from a cyclohexanecarboxylic acid.\\nThe corresponding SMILES representation is:\\nC1CCC(CC1)C(=O)NC2=NC=C(S2)CC3=CC=CC4=CC=CC=C43\\nThe natural language question is: The molecule is a 3beta-sterol consisting of an ergostane skeleton with double bonds at 7- and 22-positions. It has a role as a metabolite, an anti-HSV-1 agent, an EC 3.2.1.18 (exo-alpha-sialidase) inhibitor and an antifungal agent. It derives from a hydride of a 5alpha-ergostane.\\nThe corresponding SMILES representation is:\\nC[C@H](/C=C/[C@H](C)C(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(CC[C@@H](C4)O)C)C\\nThe natural language question is: The molecule is a N-acylglycinate that is the conjugate base of glycolithocholic acid; major species at pH 7.3. It has a role as a human metabolite. 
It is a conjugate base of a glycolithocholic acid.\\nThe corresponding SMILES representation is:\\nC[C@H](CCC(=O)NCC(=O)[O-])[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC[C@H]4[C@@]3(CC[C@H](C4)O)C)C\\nThe natural language question is: The molecule is a member of the class of 1-benzothiophenes that is raloxifene in which the piperidin-1-yl group has been replaced by a pyrrolidin-1-yl group. It has a role as an estrogen receptor antagonist, an estrogen receptor modulator and a bone density conservation agent. It is a member of 1-benzothiophenes, a member of phenols, an aromatic ketone and a N-alkylpyrrolidine.\\nThe corresponding SMILES representation is:\\nC1CCN(C1)CCOC2=CC=C(C=C2)C(=O)C3=C(SC4=C3C=CC(=C4)O)C5=CC=C(C=C5)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an amino acid zwitterion. It is a conjugate base of a cysteinium. It is a conjugate acid of a cysteinate(1-). It is a tautomer of a cysteine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(C(=O)[O-])[NH3+])S'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16197',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a sesquiterpene lactone isolated from the leaves of Eremophila mitchellii. It has a role as a plant metabolite. 
It is a sesquiterpene lactone, an organic heterotetracyclic compound and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@H]2CC[C@@]([C@H]3[C@@H]2[C@H]4[C@@H]1OC(=O)C4=CC3)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16667',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a steroid acid that is ergosta-8,24(28)-dien-26-oic acid substituted by a methyl group at position 4 and oxo groups at positions 3, 7 and 11 (the 4alpha,5alpha stereoisomer). Isolated from Antrodia cinnamomea and Antrodia camphorata, it exhibits cytotoxic and anti-inflammatory activity. It has a role as an antineoplastic agent, an anti-inflammatory agent and a plant metabolite. It is a 3-oxo steroid, a 7-oxo steroid, an 11-oxo steroid, a steroid acid, a monocarboxylic acid and an ergostanoid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]2CC(=O)C3=C([C@]2(CCC1=O)C)C(=O)C[C@]4([C@H]3CC[C@@H]4[C@H](C)CCC(=C)C(C)C(=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10947',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a fluorinated steroid, a glucocorticoid, an 11beta-hydroxy steroid, a 17alpha-hydroxy steroid, a 21-hydroxy steroid, a 20-oxo steroid, a 3-oxo-Delta(1),Delta(4)-steroid, a primary alpha-hydroxy ketone and a tertiary alpha-hydroxy ketone. It has a role as an anti-inflammatory drug. 
It derives from a hydride of a pregnane.\\nThe corresponding SMILES representation is:\\nC[C@@H]1C[C@H]2[C@@H]3C[C@@H](C4=CC(=O)C=C[C@@]4([C@]3([C@H](C[C@@]2([C@]1(C(=O)CO)O)C)O)F)C)F\\nThe natural language question is: The molecule is a hydroxy fatty acid ascaroside anion that is the conjugate base of oscr#28, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of an oscr#28.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCCCCCCC(=O)[O-])O)O\\nThe natural language question is: The molecule is the R- (less active) enantiomer of ketamine. It has a role as an intravenous anaesthetic, an analgesic and a NMDA receptor antagonist. It is an enantiomer of an esketamine.\\nThe corresponding SMILES representation is:\\nCN[C@]1(CCCCC1=O)C2=CC=CC=C2Cl\\nThe natural language question is: The molecule is a 16-HETE in which the chiral centre at position 16 has S-configuration. It has a role as an anti-inflammatory agent and a human xenobiotic metabolite. It is a conjugate acid of a 16(S)-HETE(1-). It is an enantiomer of a 16(R)-HETE.\\nThe corresponding SMILES representation is:\\nCCCC[C@@H](/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the L-enantiomer of 5-hydroxytryptophan. It has a role as a human metabolite, a plant metabolite and a mouse metabolite. It is a 5-hydroxytryptophan, a hydroxy-L-tryptophan and a non-proteinogenic L-alpha-amino acid. It is an enantiomer of a 5-hydroxy-D-tryptophan. 
It is a tautomer of a 5-hydroxy-L-tryptophan zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20501',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a C20-gibberellin, initially identified in Lupinus luteus, that is gibberellin A12 in which extra hydroxy substituents are present at the 2beta- and 7alpha-positions. It has a role as a plant metabolite. It is a C20-gibberellin and a dicarboxylic acid. It derives from a gibberellin A12.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CC[C@@H]([C@@]([C@H]1[C@@H]([C@]34[C@H]2CC[C@](C3)(C(=C)C4)O)C(=O)O)(C)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6935',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a polyphenol that is a phloroglucinol derivative isolated from the rhizomes of Dryopteris crassirhizoma and has been shown to exhibit radical scavenging and antibacterial activity. It has a role as a metabolite, a radical scavenger and an antibacterial agent. It is a beta-hydroxy ketone, a polyphenol and an aromatic ketone. It derives from a phloroglucinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC(=O)C1=C(C(=C(C(=C1O)CC2=C(C(C(=O)C(=C2O)C(=O)CC)(C)C)O)O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12987',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a monocarboxylic acid that is valeric acid substituted by a phenyl group at the delta-position. It is a monocarboxylic acid and a member of benzenes. 
It derives from a valeric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)CCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24973',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a triterpene that is docosahydropicene substituted by 8 methyl groups at positions 2, 2, 4a, 6a, 8a, 9, 12b and 14a. It has a role as a metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1CCC[C@@H]2[C@@]1(CC[C@H]3[C@]2(CC[C@@]4([C@@]3(CC[C@@]5([C@H]4CC(CC5)(C)C)C)C)C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16056',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hydroxy monocarboxylic acid anion obtained by deprotonation of the carboxy groups of 2-hydroxy-3-carboxy-6-oxo-7-methylocta-2,4-dienoic acid; major species at pH 7.3. It derives from an octa-2,4-dienoate. 
It is a conjugate base of a 2-hydroxy-3-carboxy-6-oxo-7-methylocta-2,4-dienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C(=O)/C=C\\\\\\\\C(=C(/C(=O)O)\\\\\\\\[O-])\\\\\\\\C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6161',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a carbohydrate-functionalised sequence-defined oligo(amidoamine) in which an Asn-Leu-Phe-Gln-Val-Val-His-Asn-Ser-Tyr-Asn-Arg-Pro-Ala-Tyr-Ser-Pro-Gly amino acid sequence is linked via its terminal glycine residue to the amino group at C-1 of a 54-amino-7,10,18,21,29,32,40,43,51,54-decaoxo-3,6,11,14,17,22,25,28,33,36,39,44,47,50-tetradecaazatetrapentacont-1-ylamino chain, to the -NH- groups at positions 3, 14, 25, 36 and 47 of which are also linked alpha-L-rhamnosyl-(1->3)-beta-D-glucosyloxy disaccharide units via 5-(ethylsulfinyl)pentanoyl chains.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 
'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@H]2[C@@H]([C@H](O[C@H]([C@@H]2O)OCCS(=O)CCCCC(=O)N(CCNC(=O)CCC(=O)N)CCNC(=O)CCC(=O)NCCN(CCNC(=O)CCC(=O)NCCN(CCNC(=O)CCC(=O)NCCN(CCNC(=O)CCC(=O)NCCN(CCNC(=O)CNC(=O)[C@@H]3CCCN3C(=O)[C@H](CO)NC(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](C)NC(=O)[C@@H]5CCCN5C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CC6=CC=C(C=C6)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CC7=CNC=N7)NC(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CC8=CC=CC=C8)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(=O)N)N)C(=O)CCCCS(=O)CCO[C@H]9[C@@H]([C@H]([C@@H]([C@H](O9)CO)O)O[C@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)C)O)O)O)O)C(=O)CCCCS(=O)CCO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)C)O)O)O)O)C(=O)CCCCS(=O)CCO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)C)O)O)O)O)C(=O)CCCCS(=O)CCO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)C)O)O)O)O)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8328',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a limonoid that is 17-epiazadiradione substituted by a hydroxy group at position 17. Isolated from Azadirachta indica, it exhibits antineoplastic activity. It has a role as a metabolite, a plant metabolite and an antineoplastic agent. It is an acetate ester, a cyclic terpene ketone, a member of furans, a limonoid, a tetracyclic triterpenoid, a tertiary alcohol and a tertiary alpha-hydroxy ketone. 
It derives from a 17-epiazadiradione.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)O[C@@H]1C[C@@H]2[C@](C=CC(=O)C2(C)C)([C@@H]3[C@@]1(C4=CC(=O)[C@@]([C@@]4(CC3)C)(C5=COC=C5)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12343',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a 20-oxo steroid, a 17alpha-hydroxy steroid, a chlorinated steroid, a 3-oxo-Delta(4) steroid and a tertiary alpha-hydroxy ketone. It has a role as an androgen antagonist. It derives from a hydride of a pregnane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)[C@]1(CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2C=C(C4=CC(=O)[C@@H]5C[C@@H]5[C@]34C)Cl)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7362',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a synthetic RNA fragment comprised of five adenosine, seven guanosine, five uridine and eight cytidine residues connected by 3'->5' phosphodiester linkages in the sequence A-A-G-U-C-U-C-C-A-C-U-C-G-A-G-U-G-U-C-C-G-A-G-C-G.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 
'C1=CN(C(=O)N=C1N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O[C@@H]3[C@H](O[C@H]([C@@H]3O)N4C=CC(=O)NC4=O)COP(=O)(O)O[C@@H]5[C@H](O[C@H]([C@@H]5O)N6C=NC7=C6N=C(NC7=O)N)COP(=O)(O)O[C@@H]8[C@H](O[C@H]([C@@H]8O)N9C=NC1=C(N=CN=C19)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C(N=CN=C21)N)CO)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=O)NC1=O)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C(N=CN=C21)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=O)NC1=O)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C(N=CN=C21)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=O)NC1=O)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=O)NC1=O)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C(N=CN=C21)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=NC1=O)N)O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=NC2=C1N=C(NC2=O)N)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13565',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a N-acyl-L-glutamic acid and a tricarboxylic acid. It has a role as an Escherichia coli metabolite and a mouse metabolite. 
It is a conjugate acid of a N-(3-carboxylatopropanoyl)-L-glutamate(3-).\\nThe corresponding SMILES representation is:\\nC(CC(=O)O)[C@@H](C(=O)O)NC(=O)CCC(=O)O\\nThe natural language question is: The molecule is conjugate base of xanthommatin having both carboxy groups deprotonated and the amino group protonated; major species at pH 7.3. It is a conjugate base of a xanthommatin.\\nThe corresponding SMILES representation is:\\nC1=CC(=C2C(=C1)OC3=CC(=O)C4=C(C3=N2)C(=O)C=C(N4)C(=O)[O-])C(=O)CC(C(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is a germacranolide isolated from Neurolaena lobata and Austroeupatorium inulifolium and has been shown to exhibit antimalarial activity. It has a role as a metabolite and an antimalarial. It is a germacranolide, a fatty acid ester, an enone and a tertiary alpha-hydroxy ketone. It derives from an isovaleric acid.\\nThe corresponding SMILES representation is:\\nC[C@@H]/1C[C@@H]2[C@@H]([C@@H](C[C@](C(=O)/C=C1)(C)O)OC(=O)CC(C)C)C(=C)C(=O)O2\\nThe natural language question is: The molecule is conjugate acid of (S)-piperazine-2-carboxamide arising from selective protonation at the 4-position. It is a conjugate acid of a (S)-piperazine-2-carboxamide.\\nThe corresponding SMILES representation is:\\nC1CN[C@@H](C[NH2+]1)C(=O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is any D-hexose 6-phosphate in which the hexose is in the pyranose form. It is a conjugate acid of a D-hexopyranose 6-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1C(C(C(C(O1)O)O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5860',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a pentacarboxylic acid anion. It is a conjugate base of a pentetic acid. 
It is a conjugate acid of a pentetate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CN(CC(=O)O)CC(=O)O)N(CCN(CC(=O)O)CC(=O)O)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9334',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a clathrate compound that is an ice-like solid that consists of methane which is trapped within the crystal structure of water. It has formula CH4.5.75H2O or 4CH4.23H2O. It contains a methane and a water.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C.C.C.C.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6735',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a mannonic acid derivative in which the hydroxy group at position 2 has been replaced by an amino group. It derives from a D-mannonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@H]([C@@H]([C@@H](C(=O)O)N)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12594',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an anionic phospholipid obtained by deprotonation of both phosphate OH groups of 3-heptaprenyl-sn-glycero-1-phosphate; major species at pH 7.3. 
It is a conjugate base of a 3-heptaprenyl-sn-glycero-1-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/COC[C@@H](COP(=O)([O-])[O-])O)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9142',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of malonyl-CoA methyl ester; major species at pH 7.3. It is a conjugate base of a malonyl-CoA methyl ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])C(C(=O)NCCC(=O)NCCSC(=O)CC(=O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15764',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a dipeptide that is the N-(L-alpha-aspartyl) derivative of L-isoleucine. It has a role as a human urinary metabolite. It derives from a L-aspartic acid and a L-isoleucine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@@H](C(=O)O)NC(=O)[C@H](CC(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13840',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the diphosphate OH groups of alpha-D-mannosyl-(1->3)-N-acetyl-alpha-D-glucosaminyl-1-diphospho-ditrans,polycis-undecaprenol. Major species at pH 7.3. 
It is a conjugate base of an alpha-D-mannosyl-(1->3)-N-acetyl-alpha-D-glucosaminyl-1-diphospho-ditrans,polycis-undecaprenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])OP(=O)([O-])O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@@H]2[C@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15391',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a quercetin O-glucoside in which quercetin is attached to a alpha-L-arabinofuranosyl group at position 3 via a glycosidic linkage while the hydroxy group at position 5 is replaced by a galloyl group. Isolated from the young leaves of Calycolpus warscewiczianus, it exhibits activity against a chloroquine-resistant strain of Plasmodium falciparum. It has a role as a metabolite and an antiplasmodial drug. It is a glycosyloxyflavone, a trihydroxyflavone, an alpha-L-arabinofuranoside and a monosaccharide derivative. It derives from a gallic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)C(=O)C4=CC(=C(C(=C4)O)O)O)O[C@H]5[C@@H]([C@H]([C@@H](O5)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20379',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is the 11-hydroperoxy derivative of linoleic acid having (S)-configuration. It derives from a linoleic acid. It is a conjugate acid of an (11S)-11-hydroperoxylinoleate. 
It is an enantiomer of an (11R)-11-hydroperoxylinoleic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\[C@@H](/C=C\\\\\\\\CCCCCCCC(=O)O)OO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12083',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a glycosyloxyflavone that is isovitexin in which the hydroxyl hydrogen at position 7 is replaced by a 6-(6-methoxycaffeoyl)glucosyl residue. It has a role as a metabolite. It is a C-glycosyl compound, a cinnamate ester, a dihydroxyflavone and a glycosyloxyflavone. It derives from an isovitexin and a trans-caffeic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=C(C=C1/C=C/C(=O)OC[C@@H]2[C@H]([C@@H]([C@H](C(O2)OC3=C(C(=C4C(=C3)OC(=CC4=O)C5=CC=C(C=C5)O)O)[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19013',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a dicarboxylic acid dianion resulting from the deprotonation of both ot the carboxy groups of 5,6,6'-trihydroxy-5'-methoxy[biphenyl]-3,3'-dicarboxylic acid. The major microspecies at pH 7.3. It is an aromatic carboxylate and a dicarboxylic acid dianion. 
It is a conjugate base of a 5,6,6'-trihydroxy-5'-methoxy[biphenyl]-3,3'-dicarboxylic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC(=CC(=C1[O-])C2=C(C(=CC(=C2)C(=O)O)O)[O-])C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11091',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an amino disaccharide consisting of 2-acetamido-2-deoxy-alpha-D-glucopyranose and beta-D-glucopyranose residues joined in sequence by a (1->2) glycosidic linkage. It is an amino disaccharide and a member of acetamides. It derives from a beta-D-glucose and a N-acetyl-alpha-D-glucosamine.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2O)CO)O)O)CO)O)O\\nThe natural language question is: The molecule is a 3-methylxanthine tautomer where the imidazole proton is located at the 9-position. It has a role as a metabolite. It is a tautomer of a 3-methyl-7H-xanthine.\\nThe corresponding SMILES representation is:\\nCN1C2=C(C(=O)NC1=O)NC=N2\\nThe natural language question is: The molecule is a monounsaturated fatty acid that is nonadecanoic acid that has been dehydrogenated to introduce a double bond with Z configuration between the carbons at positions 9 and 10. It has been isolated from the spores of reishi mushroom, Ganoderma lucidum, and found to inhibit tumour cell proliferation and induce apoptosis of HL-60 cells. It has a role as a fungal metabolite, an apoptosis inducer and an antineoplastic agent. 
It is a long-chain fatty acid, a monounsaturated fatty acid and a straight-chain fatty acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)O\\nThe natural language question is: The molecule is a branched amino oligosaccharide that is a tridecasaccharide derivative consisting of a linear trisaccharide of beta-D-mannose and two N-acetyl-beta-D-glucosamine residues (one of which is at the reducing end) all linked in sequence (1->4), to the mannosyl residue of which are linked (1->3) and (1->6) two beta-D-galactosyl-(1->4)-N-acetyl-beta-D-glucosaminyl-(1->3)-beta-D-galactosyl-(1->4)-N-acetyl-beta-D-glucosaminyl-(1->2)-alpha-D-mannosyl linear pentasaccharide units. It is an amino oligosaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O[C@H]6[C@@H]([C@H]([C@H]([C@H](O6)CO)O)O[C@H]7[C@@H]([C@H]([C@@H]([C@H](O7)CO)O[C@H]8[C@@H]([C@H]([C@H]([C@H](O8)CO)O)O)O)O)NC(=O)C)O)O)NC(=O)C)O)O[C@@H]9[C@H]([C@H]([C@@H]([C@H](O9)CO)O)O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)O)NC(=O)C)O)O)NC(=O)C)O)O)NC(=O)C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a compound of ammonium, iron and sulfate in which the ratio of ammonium to iron(2+) to sulfate ions is 2:1:2. It is a metal sulfate, an iron molecular entity and an ammonium salt. 
It contains an iron(2+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[NH4+].[NH4+].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Fe+2]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2588',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is the zwitterion of L-prolinylglycine resulting from the transfer of a proton from the hydroxy group of glycine to the amino group of proline. Major microspecies at pH 7.3. It is a tautomer of a L-prolylglycine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[C@H]([NH2+]C1)C(=O)NCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26584',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a tripeptide consisting of an Ile-Ile-Thr-NH2 sequence N-substituted on the threonamide amidic nitrogen with a (2S)-4-methyl-1-[(2R)-2-methyloxiran-2-yl]-1-oxopentan-2-yl group and with acetyl and methyl groups on the nitrogen of the isoleucine residue distal to the threonamide; a naturally occurring selective proteasome inhibitor with anti-inflammatory activity. It has a role as a proteasome inhibitor. It is a member of morpholines and a tripeptide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@@H](C(=O)N[C@@H]([C@@H](C)O)C(=O)N[C@@H](CC(C)C)C(=O)[C@]1(CO1)C)NC(=O)[C@H]([C@@H](C)CC)N(C)C(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12104',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a butanediol that is butane in which one hydrogen of each of the methyl groups is substituted by a hydroxy group. A colourless, water-miscible, viscous liquid at room temperature (m.p. 
16℃) with a high boiling point (230℃), it is mainly used for the production of other organic chemicals, particularly the solvent oxolane (also known as tetrahydrofuran or THF). It has a role as a neurotoxin, a protic solvent and a prodrug. It is a butanediol and a glycol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCO)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19507',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a D-galactopyranose having beta-configuration at the anomeric centre. It has a role as an epitope and a mouse metabolite. It is an enantiomer of a beta-L-galactose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25168',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of the class of phosphonic acids that is phosphonic acid in which the hydrogen attached to the phosphorus is substituted by a 1-aminocyclopropyl group. It derives from a phosphonic acid. It is a conjugate acid of a 1-aminocyclopropylphosphonate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC1(N)P(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22293',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hexacosapentaenoate that is the conjugate base of (11Z,14Z,17Z,20Z,23Z)-hexacosapentaenoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. 
It is a conjugate base of an (11Z,14Z,17Z,20Z,23Z)-hexacosapentaenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1591',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a organophosphate oxoanion obtained by deprotonation of the phosphate OH groups of 1D-myo-inositol 6-phosphate. It is an inositol phosphate oxoanion and a myo-inositol phosphate(2-). It is a conjugate base of a 1D-myo-inositol 6-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@@H]1([C@H](C([C@H]([C@@H](C1O)O)O)OP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6171',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a tirucallane triterpenoid that is tirucalla-7,24-diene substituted by hydroxy groups at positions 3 and 22 and an oxo group at position 23. It has been isolated from Dysoxylum lenticellatum. It has a role as a plant metabolite. 
It is a tirucallane triterpenoid and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@@H]1CC[C@]2([C@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(CC[C@@H](C4(C)C)O)C)C)C)[C@@H](C(=O)C=C(C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10026',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a heterodetic cyclic peptide consisting of N-acylated trytophan, 3,5-dichloro-4-hydroxyphenylglycine, 4-hydroxyphenylglycine, 3,5-dichloro-4-hydroxyphenylglycyl, tyrosine and 4-hydroxyphenylglycine residues joined in sequence and in which the side-chain of the central 4-hydroxyphenylglycine residue is attached to the side-chain of the tryptophan via a C3-C6 bond and to the side-chain of the tyrosine via an ether bond from C5. It is isolated from the culture broth of Streptomyces and has anti-HIV-1 activity. It has a role as a metabolite, an antimicrobial agent and an anti-HIV-1 agent. It is a member of indoles, a cyclic ether, a heterodetic cyclic peptide, an organochlorine compound, a peptide antibiotic and a polyphenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1[C@@H](CC2=CC=C(C=C2)OC3=CC4=CC(=C3O)C5=CC6=C(C=C5)C(=CN6)C[C@H](C(=O)N[C@@H](C(=O)N[C@H]4C(=O)N[C@@H](C1=O)C7=CC(=C(C(=C7)Cl)O)Cl)C8=CC(=C(C(=C8)Cl)O)Cl)NC(=O)C(=O)C9=CC(=C(C(=C9)Cl)O)Cl)C(=O)N[C@H](C1=CC=C(C=C1)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21570',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a tetrapeptide that consists of [(N(6)-lysyl)-N(6)-lysyl]-lysyl-lysinamide where the two side-chain amino functions are acylated by 4-(indol-3-yl)butanoyl and 6-[(5-nitro-2-furoyl)amino]hexanoyl groups. 
It is a tetrapeptide, a C-nitro compound, a member of furans and a member of indoles.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=CN2)CCCC(=O)NCCCCC(C(=O)N)NC(=O)C(CCCCNC(=O)C(CCCCNC(=O)C(CCCCNC(=O)CCCCCNC(=O)C3=CC=C(O3)[N+](=O)[O-])N)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10872',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a tetracyclic triterpenoid that is dammarane with an exocyclic double bond at C-25 and substituent hydroxy groups at positions 3, 6, 12, 20, 22 and 24 (the 3beta,6alpha,12beta,24S stereoisomer). It is isolated from the leaves of Panax ginseng and exhibits cytotoxicity in the human hepatoma cell line, HepG2. It has a role as a metabolite and an antineoplastic agent. It is a tetracyclic triterpenoid, a hexol and a tertiary alcohol. It derives from a hydride of a dammarane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=C)[C@H](CC([C@@](C)([C@H]1CC[C@@]2([C@@H]1[C@@H](C[C@H]3[C@]2(C[C@@H]([C@@H]4[C@@]3(CC[C@@H](C4(C)C)O)C)O)C)O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19648',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an organic trisulfide that is trisulfane in which both of the hydrogens are replaced by allyl groups. 
A component of the essential oil of garlic and a major component of the traditional Chinese medicine allitridium, it exhibits antifungal, antitumour and antioxidant activity It has a role as an apoptosis inducer, an estrogen receptor antagonist, an antineoplastic agent, a vasodilator agent, an antioxidant, an anti-inflammatory agent, an insecticide, an antiprotozoal drug, a platelet aggregation inhibitor and an antilipemic drug.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C=CCSSSCC=C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21361',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is (4R)-Limonene hydroperoxide where the hydroperoxy group is located at position 2 of the limonene skeleton; one of the two main allergenic hydroperoxides formed by autoxidation of (4R)-limonene. It has a role as an allergen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC[C@@H](CC1OO)C(=C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26262',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an amino pentasaccharide epitope consisting of three 3-deoxy-D-manno-oct-2-ulose residues and two N-acetylglucosamine residues (one at the reducing end) in a linear sequence. It has a role as an epitope. 
It is an amino pentasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1O)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO[C@@]3(C[C@H]([C@H]([C@H](O3)[C@@H](CO)O)O)O[C@@]4(C[C@H]([C@H]([C@H](O4)[C@@H](CO[C@@]5(C[C@H]([C@H]([C@H](O5)[C@@H](CO)O)O)O)C(=O)O)O)O)O)C(=O)O)C(=O)O)O)O)NC(=O)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_881',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a dihydroxybenzenesulfonic acid that is resorcinol in which the hydrogen ortho- to both of the hydroxy groups is replaced by a sulfonic acid group. It has a role as a metabolite. It derives from a resorcinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1)O)S(=O)(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13176',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is the amino dicarboxylic acid that is heptanedioic acid with amino substituents at C-2 and C-6. It has a role as an Escherichia coli metabolite. It derives from a pimelic acid. It is a conjugate acid of a 2,6-diaminopimelate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(C(=O)O)N)CC(C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13283',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an ammonium ion obtained by the protonation of the tertiary amino group of fumigaclavine C; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. 
It is a conjugate acid of a fumigaclavine C.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C[NH+]([C@@H]2CC3=C(NC4=CC=CC(=C34)[C@H]2[C@H]1OC(=O)C)C(C)(C)C=C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6426',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a 1-alkyl-2-acetyl-3-acyl-sn-glycerol in which the alkyl and acyl groups are specified as palmityl and palmitoyl. It derives from a 1-O-palmityl-2-acetyl-sn-glycerol and a hexadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCOC[C@H](COC(=O)CCCCCCCCCCCCCCC)OC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16485',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an oxo monocarboxylic acid that is (+)-7-isojasmonic acid in which one of the hydrogens of the side-chain methyl group is replaced by a hydroxy group. It has a role as a member of jasmonates and a plant metabolite. It is a member of cyclopentanones, a primary alcohol, an oxo monocarboxylic acid and a homoallylic alcohol. It derives from a (+)-7-isojasmonic acid. It is a conjugate acid of a tuberonate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC(=O)[C@H]([C@H]1CC(=O)O)C/C=C\\\\\\\\CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4094',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a lignan isolated from the stems of Sinocalamus affinis. It has a role as a plant metabolite. 
It is a lignan, an organic heterotricyclic compound, an oxacycle, a dimethoxybenzene and a member of phenols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1(OC[C@@H]2CC3=CC(=C(C(=C3[C@@H]([C@H]2CO1)C4=CC(=C(C(=C4)OC)O)OC)OC)O)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_193',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a glycol that is octacosane bearing two hydroxy substituents located at positions 1 and 2. It derives from an octacosane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCC(CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17907',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a dimethylbenzoate in which the two methyl groups are located at positions 2 and 3. It derives from a benzoate. It is a conjugate base of a 2,3-dimethylbenzoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=CC=C1)C(=O)[O-])C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2693',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an S-acyl-4'-phosphopantetheine obtained by formal condensation of the thiol group of D-pantetheine 4'-phosphate with the carboxy group of tetradecanoic acid. It has a role as a mouse metabolite. It derives from a tetradecanoic acid. 
It is a conjugate acid of a S-tetradecanoyl-4'-phosphopantetheine(2-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)O)O\\nThe natural language question is: The molecule is a glycosylmannose consisting of alpha-D-galactopyranose and alpha-D-mannopyranose residues joined in sequence by a (1->3) glycosidic bond. It derives from an alpha-D-galactose and an alpha-D-mannose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O)O)O[C@@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O\\nThe natural language question is: The molecule is propane-1,3-diamine in which a hydrogen attached to one nitrogen is substituted by a 3-aminoprop-1-yl group, and a hydrogen attached to the other nitrogen is substituted by a 4-aminobut-1-yl group. A polyamine natural product, its name arises from its similarity to spermine and the fact that it was first isolated from the extreme thermophile, Thermus thermophilus. It is a polyazaalkane and a tetramine. It is a conjugate base of a thermosperminium(4+).\\nThe corresponding SMILES representation is:\\nC(CCNCCCNCCCN)CN\\nThe natural language question is: The molecule is the conjugate base of (S)-3-hydroxybutyric acid. It is a conjugate base of a (S)-3-hydroxybutyric acid. It is an enantiomer of a (R)-3-hydroxybutyrate.\\nThe corresponding SMILES representation is:\\nC[C@@H](CC(=O)[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an omega-hydroxy fatty acid that is 21-hydroxyhenicosanoic acid that has been dehydrogenated to introduce a trans double bond at the 2-3 position. 
It is an omega-hydroxy fatty acid, an alpha,beta-unsaturated monocarboxylic acid, a long-chain fatty acid, a straight-chain fatty acid and a hydroxy monounsaturated fatty acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C(CCCCCCCCCO)CCCCCCCC/C=C/C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22671',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a nitrobenzoic acid having the nitro group at the 4-position. It derives from a benzoic acid. It is a conjugate acid of a 4-nitrobenzoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(=O)O)[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22447',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an aromatic ketone that is propan-1-one substituted by a 2,4-dihydroxy-3,5-dimethoxyphenyl group at position 1. It has been isolated from the leaves of Garcia parviflora. It has a role as a plant metabolite. It is a dimethoxybenzene, a member of resorcinols and an aromatic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(=O)C1=CC(=C(C(=C1O)OC)O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18422',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a quaternary ammonium salt resulting from the reaction of 2,4-D with choline. It is a post-emergence herbicide used for selective control of broadleaf weeds. It has a role as a phenoxy herbicide, a synthetic auxin and an agrochemical. 
It contains a (2,4-dichlorophenoxy)acetate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[N+](C)(C)CCO.C1=CC(=C(C=C1Cl)Cl)OCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2923',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a tetracyclic diterpenoid with formula C20H22O5, originally isolated from the dried root outer bark of Tripterygium hypoglaucum. It has a role as a plant metabolite. It is a member of benzenediols, a cyclic ketone, a tetracyclic diterpenoid and a gamma-lactone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C1=CC(=C2C(=C1O)C(=O)CC3[C@@]2(CCC4=C3COC4=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10627',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an N-alkylpiperazine carrying (4-chlorophenyl)(phenyl)methyl and 4-tert-butylbenzyl groups. It has a role as an antiemetic, a cholinergic antagonist, a histamine antagonist, a local anaesthetic and a central nervous system depressant. It is a N-alkylpiperazine and a member of monochlorobenzenes. It is a conjugate base of a buclizine(2+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C)C1=CC=C(C=C1)CN2CCN(CC2)C(C3=CC=CC=C3)C4=CC=C(C=C4)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19320',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a thiouridine in which the oxygen replaced by sulfur is that at C-2. 
It is a thiouridine and a nucleoside analogue.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CN(C(=S)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24186',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a monocarboxylic acid that is butyric acid in which one of the hydrogens at position 4 has been replaced by a thiol group. It is a thiol and a monocarboxylic acid. It is a conjugate acid of a 4-sulfanylbutanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)O)CS'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13092',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is conjugate acid of beta-alaninamide arising from protonation of the beta-amino group. It is a conjugate acid of a beta-alaninamide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C[NH3+])C(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5789',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a steroid acid that is 5alpha-cholest-7-en-26-oic acid that has S configuration at position 25 and is substituted by a hydroxy group at the 3alpha position. It is an endogenous ligand for DAF-12 in Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. 
It is a 3alpha-hydroxy steroid, a steroid acid, a monocarboxylic acid and a cholestanoid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC[C@H](C)C(=O)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(CC[C@H](C4)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25640',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a cyclohexadienediol in which the two hydroxy groups are ortho and cis to one another, with chlorine atoms on each of the other four ring carbons. It is a cyclohexadienediol and an organochlorine compound. It derives from a 1,2,3,4-tetrachlorobenzene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@H]1([C@@H](C(=C(C(=C1Cl)Cl)Cl)Cl)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9461',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a dioxo monocarboxylic acid anion obtained by the deprotonation of the carboxylic group of globostellatic acid A. It is a conjugate base of a globostellatic acid A.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=C\\\\\\\\C=C\\\\\\\\C(C)(C)O)/C=C/C(=O)/C(=C/1\\\\\\\\C(=O)C[C@@H]2[C@@]1(CC[C@@H]3[C@@]2(CC[C@H]([C@]3(C)C(=O)[O-])OC(=O)C)C)C)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3811',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a N-acyl-beta-D-galactosylsphingosine in which the acyl group is tetracosananoyl. It has a role as a mouse metabolite. 
It derives from a tetracosanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)[C@@H](/C=C/CCCCCCCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5152',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a medium-chain fatty acid that is the 12-hydroxylated derivative of lauric acid. It has a role as a human metabolite. It is an omega-hydroxy fatty acid and a medium-chain fatty acid. It derives from a dodecanoic acid. It is a conjugate acid of a 12-hydroxylaurate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCC(=O)O)CCCCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4747',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a natural product found in Azadirachta indica. It has a role as a metabolite and a plant metabolite. It is an acetate ester, a cinnamate ester, an organic heteropentacyclic compound, a member of furans, a limonoid and a methyl ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C2[C@@H](C[C@H]1C3=COC=C3)O[C@H]4[C@@]2([C@@H]([C@]5([C@H](C[C@H]([C@@]6([C@@H]5[C@H]4OC6)C)OC(=O)C)OC(=O)/C=C/C7=CC=CC=C7)C)CC(=O)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1130',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organophosphate oxoanion that is the conjugate base of carbamoyl adenylate, obtained by deprotonation of the phosphate group. 
It is a conjugate base of a carbamoyl adenylate.\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OC(=O)N)O)O)N\\nThe natural language question is: The molecule is a 1-phenyl-2,3,4,5-tetrahydro-1H-3-benzazepine-7,8-diol that is the S-enantiomer of SKF 38393. It is a conjugate base of a (S)-SKF 38393(1+). It is an enantiomer of a (R)-SKF 38393.\\nThe corresponding SMILES representation is:\\nC1CNC[C@H](C2=CC(=C(C=C21)O)O)C3=CC=CC=C3\\nThe natural language question is: The molecule is a 2-trans-abscisate obtained by removal of a proton from the carboxy group of (S)-2-trans-abscisic acid. It is a conjugate base of a (S)-2-trans-abscisic acid. It is an enantiomer of a (R)-2-trans-abscisate.\\nThe corresponding SMILES representation is:\\nCC1=CC(=O)CC([C@]1(/C=C/C(=C/C(=O)[O-])/C)O)(C)C\\nThe natural language question is: The molecule is a member of the class of xanthones that is 9H-xanthen-9-one substituted by hydroxy groups at positions 2, 3, 6 and 8, an isoprenyl group at position 1 and a 2-methylbut-3-en-2-yl group at position 5. It is isolated from the root barks of Cudrania tricuspidata and exhibits cytotoxicity towards human cancer cell lines. It has a role as a metabolite, an antineoplastic agent, an EC 1.14.99.1 (prostaglandin-endoperoxide synthase) inhibitor and an anti-inflammatory agent. 
It is a member of xanthones and a polyphenol.\\nThe corresponding SMILES representation is:\\nCC(=CCC1=C(C(=CC2=C1C(=O)C3=C(O2)C(=C(C=C3O)O)C(C)(C)C=C)O)O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a lipid A derivative, prepared from Campylobacter jejuni HS:19, in which each of its two glucosaminyl units is substituted on nitrogen by a 3-(2-hydroxytetracosanoyloxy)octadecanoyl unit.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCC(C(=O)O[C@H](CCCCCCCCCCCCCCC)CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)(O)O)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)OP(=O)(O)O)O)NC(=O)C[C@@H](CCCCCCCCCCCCCCC)OC(=O)C(CCCCCCCCCCCCCCCCCCCCCC)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_53',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a ketone that is decane in which the methylene hydrogens at position 3 are replaced by an oxo group. It has a role as a food additive and a metabolite. It derives from a hydride of a decane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC(=O)CC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1126',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of lipid As that is lipid IVA in which one of the free OH groups on the two N-hydroxytetradecanoyl groups is carrying a palimitoyl group. 
It derives from a lipid IVA.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)O[C@H](CCCCCCCCCCC)CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)(O)O)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)OP(=O)(O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)NC(=O)C[C@@H](CCCCCCCCCCC)O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O\\nThe natural language question is: The molecule is a 13 amino acid peptide hormone which is found in the central nervous system and the gastrointestinal tract. It behaves as a neurotransmitter in the brain, as a hormone in the gut, and also as a neuromodulator. It is implicated in the pathophysiology of several CNS disorders (including schizophrenia, Parkinson's disease, drug abuse, pain, cancer, inflammation, eating disorders and central control of blood pressure) due to its association with a wide variety of neurotransmitter systems such as dopaminergic, sertonergic, glutamatergic, GABAergic, and cholinergic systems. It has a role as a human metabolite, a mitogen, a neurotransmitter and a vulnerary. It is a conjugate base of a neurotensin(1+).\\nThe corresponding SMILES representation is:\\nCC[C@H](C)[C@@H](C(=O)N[C@@H](CC(C)C)C(=O)O)NC(=O)[C@H](CC1=CC=C(C=C1)O)NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@@H]3CCCN3C(=O)[C@H](CCCCN)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]5CCC(=O)N5\\nThe natural language question is: The molecule is a flavonoid oxoanion obtained by deprotonation of the 5-hydroxy group of 7-O-methylvitexin 2''-O-alpha-L-rhamnoside. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). 
It is a conjugate base of a 7-O-methylvitexin 2''-O-alpha-L-rhamnoside.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2C3=C(C=C(C4=C3OC(=CC4=O)C5=CC=C(C=C5)O)[O-])OC)CO)O)O)O)O)O\\nThe natural language question is: The molecule is a 3-oxo-fatty acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of 3-oxohexacosanoyl-CoA. It is a 3-oxo-fatty acyl-CoA(4-), an 11,12-saturated fatty acyl-CoA(4-) and a very long-chain 3-oxoacyl-CoA(4-). It is a conjugate base of a 3-oxohexacosanoyl-CoA.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an N-glycosyl compound that is kinetin in which an alpha-D-glucopyranosyl residue is attached at position N-9. It has a role as a cytokinin. It is a N-glycosyl compound, a 6-alkylaminopurine and a member of furans. It derives from a kinetin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=COC(=C1)CNC2=C3C(=NC=N2)N(C=N3)[C@@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5926',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of lactic acid, arising from deprotonation of the carboxy group. It has a role as a human metabolite and an Escherichia coli metabolite. It derives from a propionate. 
It is a conjugate base of a rac-lactic acid and a 2-hydroxypropanoic acid.\\nThe corresponding SMILES representation is:\\nCC(C(=O)[O-])O\\nThe natural language question is: The molecule is a nucleotide-sugar oxoanion obtained by deprotonation of the diphosphate OH groups of dTDP-4-dehydro-2,6-dideoxy-alpha-D-glucose; major species at pH 7.3. It has a role as a bacterial metabolite. It derives from a dTDP-alpha-D-glucose(2-). It is a conjugate base of a dTDP-4-dehydro-2,6-dideoxy-alpha-D-glucose.\\nThe corresponding SMILES representation is:\\nC[C@@H]1C(=O)[C@@H](C[C@H](O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O\\nThe natural language question is: The molecule is a sulfonamide that is benzenesulfonamide substituted by an acetylamino group at position 4 and a pyrimidin-2-yl group at the nitrogen atom. It is a metabolite of the drug sulfadiazine. It has a role as a marine xenobiotic metabolite. It is a sulfonamide, a member of acetamides and a member of pyrimidines.\\nThe corresponding SMILES representation is:\\nCC(=O)NC1=CC=C(C=C1)S(=O)(=O)NC2=NC=CC=N2\\nThe natural language question is: The molecule is a tetrol consisting of 1-iminohexane with four hydroxy substituents placed at positions 2, 3, 4 and 5. It is a tetrol and an aldimine. It derives from a hydride of a hexane.\\nThe corresponding SMILES representation is:\\nCC(C(C(C(C=N)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a sulfonamide, a member of tryptamines and a heteroarylpiperidine. 
It has a role as a serotonergic agonist and a vasoconstrictor agent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CNS(=O)(=O)CCC1=CC2=C(C=C1)NC=C2C3CCN(CC3)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2943',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a heterodetic cyclic peptide that is a cyclic enkephalin analogue, having D-penicillaminyl residues located at positions 2 and 5, which form the heterocycle via a disulfide bond. It has a role as a delta-opioid receptor agonist.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1([C@H](C(=O)NCC(=O)N[C@H](C(=O)N[C@H](C(SS1)(C)C)C(=O)O)CC2=CC=CC=C2)NC(=O)[C@H](CC3=CC=C(C=C3)O)N)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19893',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is the pyranose form of D-galactose 6-phosphate. It has a role as a metabolite. It is a D-galactose 6-phosphate and a D-hexopyranose 6-phosphate. It is a conjugate acid of a D-galactopyranose 6-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H](C(O1)O)O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20523',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of 6-hydroxy-3-isopropenylheptanoic acid, arising from deprotonation of the carboxy group. Product of the hydrolysis of 4-isopropenyl-7-methyloxepan-2-one. It is a hydroxy fatty acid anion and a branched-chain fatty acid anion. 
It is a conjugate base of a 6-hydroxy-3-isopropenylheptanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCC(CC(=O)[O-])C(=C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16402',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a retinoid that consists of all-trans-retinoic acid bearing two hydroxy substituents at positions 4 and 18. It is a retinoid, a dihydroxy monocarboxylic acid and a secondary allylic alcohol. It derives from an all-trans-retinoic acid. It is a conjugate acid of an all-trans-4,18-dihydroxyretinoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=C\\\\\\\\C=C\\\\\\\\C(=C\\\\\\\\C(=O)O)\\\\\\\\C)/C=C/C1=C(C(CCC1(C)C)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3737',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an N-substituted diamine that is ethylenediamine in which the four amino hydrogens are replaced by 2-pyridylmethyl groups. It has a role as a chelator and an apoptosis inducer. It is a member of pyridines, a tertiary amino compound and a N-substituted diamine. It derives from an ethylenediamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=NC(=C1)CN(CCN(CC2=CC=CC=N2)CC3=CC=CC=N3)CC4=CC=CC=N4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26204',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an L-tryptophan derivative that is the ester obtained by formal condensation of the carboxy group of L-tryptophan with the 3'-hydroxy group of AMP. It has a role as a Mycoplasma genitalium metabolite. 
It is an adenosine 5'-phosphate, a L-tryptophan derivative, an alpha-amino acid ester and a purine ribonucleoside 5'-monophosphate. It derives from an adenosine 5'-monophosphate.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O[C@@H]3[C@H](O[C@H]([C@@H]3O)N4C=NC5=C(N=CN=C54)N)COP(=O)(O)O)N\\nThe natural language question is: The molecule is a triterpenoid saponin that is 19alpha-hydroxyasiatic acid attached to a beta-D-glucopyranosyl residue at position 28 via a glycosidic linkage. It has been isolated from the leaves of Rosa laevigata. It has a role as a plant metabolite. It is a triterpenoid saponin, a pentacyclic triterpenoid, a monosaccharide derivative, a beta-D-glucoside and a tetrol. It derives from a 19alpha-hydroxyasiatic acid. It derives from a hydride of an ursane.\\nThe corresponding SMILES representation is:\\nC[C@@H]1CC[C@@]2(CC[C@@]3(C(=CC[C@H]4[C@]3(CC[C@@H]5[C@@]4(C[C@H]([C@@H]([C@@]5(C)CO)O)O)C)C)[C@@H]2[C@]1(C)O)C)C(=O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O\\nThe natural language question is: The molecule is an organophosphate oxoanion that is a trianion obtained via deprotonation of the carboxy and phosphate OH groups of 6-phospho-2-dehydro-3-deoxy-D-galactonic acid; major species at pH 7.3. It is an organophosphate oxoanion, a carbohydrate acid derivative anion and a monocarboxylic acid anion. It is a conjugate base of a 6-phospho-2-dehydro-3-deoxy-D-galactonic acid.\\nThe corresponding SMILES representation is:\\nC([C@H]([C@@H](COP(=O)([O-])[O-])O)O)C(=O)C(=O)[O-]\\nThe natural language question is: The molecule is a p-menthane monoterpenoid in which p-menthane carries hydroxy groups at C-3 and C-8. It derives from a hydride of a p-menthane.\\nThe corresponding SMILES representation is:\\nCC1CCC(C(C1)O)C(C)(C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a UDP-D-galactose(2-) in which the anomeric centre of the galactose moiety has alpha-configuration. 
It is an UDP-D-galactose(2-) and an UDP-monosaccharide(2-). It is a conjugate base of an UDP-alpha-D-galactose.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])OP(=O)([O-])O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7468',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an acetate ester obtained by formal acetylation of the tertiary hydroxy group of 2-[(8S)-2-oxo-8,9-dihydro-2H-furo[2,3-h][1]benzopyran-8-yl]propan-2-ol. It has a role as a plant metabolite. It is a furanocoumarin and an acetate ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)OC(C)(C)[C@@H]1CC2=C(O1)C=CC3=C2OC(=O)C=C3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1073',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a D-alpha-amino acid zwitterion that is D-lysopine arising from transfer of two protons from the carboxy to the amino groups; the major species at pH 7.3. It is a tautomer of a D-lysopine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](C(=O)[O-])[NH2+][C@@H](CCCC[NH3+])C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21800',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a tripeptide composed of two L-alanyl units and an L-serine joined by peptide linkages. It has a role as a metabolite. 
It derives from a L-alanine and a L-serine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](C(=O)N[C@@H](C)C(=O)N[C@@H](CO)C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_724',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hydroxy-cannabidiol that is cannabidiol in which one of the two hydrogens at position 2 of the pentyl chain has been replaced by a hydroxy group. It is a metabolite of cannabidiol by human liver microsomes, produced by CYP3A. It has a role as a human xenobiotic metabolite. It is a hydroxy-cannabidiol, an olefinic compound, a member of resorcinols and a secondary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC(CC1=CC(=C(C(=C1)O)[C@@H]2C=C(CC[C@H]2C(=C)C)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23528',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a steroidal acyl-CoA(4-) oxoanion obtained by deprotonation of the phosphate and diphosphate OH groups of 12alpha-hydroxy-3-oxochola-4,6-dien-24-oyl-CoA; major species at pH 7.3. 
It is a conjugate base of a 12alpha-hydroxy-3-oxochola-4,6-dien-24-oyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)[C@H]4CC[C@@H]5[C@@]4([C@H](C[C@H]6[C@H]5C=CC7=CC(=O)CC[C@]67C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10282',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a tetracyclic triterpenoid that is 29-nordammara-1,17(20)-diene substituted by acetoxy groups at positions 6 and 16, a carboxy group at position 21, a hydroxy group at position 25 and oxo groups at positions 3 and 7 respectively. It is isolated from the marine-derived fungal strain Aspergillus sydowii PFW1-13 and exhibits antibacterial activity. It has a role as an antibacterial agent and an Aspergillus metabolite. It is a tetracyclic triterpenoid, an acetate ester, a tertiary alcohol, an oxo monocarboxylic acid, a cyclic ketone, a 3-oxo-Delta(1) steroid and an alpha,beta-unsaturated monocarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]2[C@@H](C(=O)[C@]3([C@H]([C@]2(C=CC1=O)C)CC[C@@H]\\\\\\\\4[C@@]3(C[C@@H](/C4=C(/CCCC(C)(C)O)\\\\\\\\C(=O)O)OC(=O)C)C)C)OC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27945',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an organic heterohexacyclic compound and spirooxindole-type pyranopyrimidine spiro compound in which the shared atom of the spiro system is the carbon at position 3 of 1-allyl-7-fluoro-1,3-dihydro-2H-indol-2-one. It has a role as an antineoplastic agent. 
It is an organic heterohexacyclic compound, an organofluorine compound, a spiro compound and a member of oxindoles.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=NC2=C(C(=O)N1C(=O)C)[C@]3(C4=C(C(=CC=C4)F)N(C3=O)CC=C)C5=C(O2)C6=C(C=CC(=C6)F)OC5=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14397',\n", + " 'prompt': \"Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a primary arylamine that is aniline in which the hydrogens at the 3- and 4-positions are replaced by methyl groups. A low-melting, crystalline solid, it is used in the production of vitamin B2, dyes, pesticides and other chemicals. It is a dimethylaniline and a primary arylamine.\\nThe corresponding SMILES representation is:\\nCC1=C(C=C(C=C1)N)C\\nThe natural language question is: The molecule is a pyrimidine 2'-deoxyribonucleoside 5'-monophosphate having 5-(4,5-dihydroxypentyl)uracil as the nucleobase. It has a role as a Mycoplasma genitalium metabolite. It derives from a dUMP.\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H](O[C@H]1N2C=C(C(=O)NC2=O)CCCC(CO)O)COP(=O)(O)O)O\\nThe natural language question is: The molecule is dianion of 3-sulfino-L-alanine arising from deprotonation of carboxy and sulfinate groups. It is a L-alpha-amino acid anion and an alkanesulfinate. It is a conjugate base of a 3-sulfino-L-alanine(1-).\\nThe corresponding SMILES representation is:\\nC([C@@H](C(=O)[O-])N)S(=O)[O-]\\nThe natural language question is: The molecule is a member of pyrazines, a member of guanidines and a Cypridina luciferin. It has a role as a member of oxidized luciferins. 
It is a conjugate base of an oxidized Cypridina luciferin(1+).\\nThe corresponding SMILES representation is:\\nCC[C@H](C)C(=O)NC1=NC=C(N=C1CCCN=C(N)N)C2=CNC3=CC=CC=C32\\nNext, you will be given a sample for test.The natural language question is: The molecule is a C-glycosyl compound that is 1,8-dihydroxy-3-methylanthracen-9(10H)-one substituted by a 1-O-acetyl-3-O-senecioyl-alpha-L-lyxopyranosyl moiety at position 10 via a C-glycosidic linkage (the 10R stereoisomer). It is isolated from the leaves of Alvaradoa haitiensis and exhibits cytotoxicity against human oral epidermoid carcinoma. It has a role as a metabolite and an antineoplastic agent. It is a C-glycosyl compound, an acetate ester, a member of anthracenes and a polyphenol. It derives from a 3-methylbut-2-enoic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=CC2=C(C(=C1)O)C(=O)C3=C([C@H]2[C@H]4[C@@H]([C@H]([C@H]([C@@H](O4)OC(=O)C)O)OC(=O)C=C(C)C)O)C=CC=C3O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23844',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a furopyran that is 5,6-dihydro-4H-furo[2,3-b]pyran-3(2H)-one which is substituted by hydroxy groups at positions 4 and 5, methyl groups at positions 2 and 5, and a heptyl group at position 6 (the 2R,4R,5S,6R stereoisomer). Isolated from the mangrove fungus Aigialus parvus BCC 5311 and from Phaeoacremonium sp., an endophytic fungus from Senna spectabilis. It has a role as a fungal metabolite and an antifungal agent. 
It is a furopyran, a tertiary alcohol, a secondary alcohol, a cyclic ketone and a ketene acetal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCC[C@@H]1[C@@]([C@@H](C2=C(O1)O[C@@H](C2=O)C)O)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27364',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a 2-fluorophenylalanine that has L-configuration. It is a 2-fluorophenylalanine and a L-phenylalanine derivative. It is an enantiomer of a 2-fluoro-D-phenylalanine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)C[C@@H](C(=O)O)N)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2214',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an oxo dicarboxylic acid that is (4Z)-hept-4-enedioic acid substituted at position 2 by an oxo group. It is an oxo dicarboxylic acid and an olefinic compound. It is a conjugate acid of a (4Z)-2-oxohept-4-enedioate. 
It is a tautomer of a (2Z,4Z)-2-hydroxyhepta-2,4-dienedioic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(/C=C\\\\\\\\CC(=O)O)C(=O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14530',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an amino nonasaccharide comprising a sequence of alpha-sialyl, beta-D-galactosyl, N-acetyl-beta-D-glucosaminyl, beta-D-galactosyl, N-acetyl-beta-D-glucosaminyl, beta-D-galactosyl and N-acetyl-beta-D-glucosamine residues connected by (2->6), (1->4), (1->3), (1->4), (1->3), (1->4) and (1->3) linkages respectively, to the reducing-end and proximal N-acetyl-beta-D-glucosaminyl residues of which are also (1->3)-linked alpha-L-fucosyl residues. It has a role as an epitope. It is an amino nonasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O[C@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO[C@@]8(C[C@@H]([C@H]([C@@H](O8)[C@@H]([C@@H](CO)O)O)NC(=O)C)O)C(=O)O)O)O)O)O)NC(=O)C)O)O[C@H]9[C@H]([C@@H]([C@@H]([C@@H](O9)C)O)O)O)NC(=O)C)O)CO)O)NC(=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6747',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an omega-hydroxy-long-chain fatty acid anion that is the conjugate base of 16-hydroxyhexadecanoic acid (also known as 16-hydroxypalmitic acid or juniperic acid). It has a role as a plant metabolite. It derives from a hexadecanoate. 
It is a conjugate base of a 16-hydroxyhexadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCCCC(=O)[O-])CCCCCCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7747',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a mannopentaose comprising alpha-D-mannopyranose, beta-D-mannopyranose, beta-D-mannopyranose, alpha-D-mannopyranose and D-mannopyranose resideus joined in sequence by (1->2) glycosidic linkages. It derives from a beta-D-Manp-(1->2)-beta-D-Manp-(1->2)-alpha-D-Manp-(1->2)-Manp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O[C@H]2[C@H]([C@@H]([C@H](O[C@H]2O[C@H]3[C@H]([C@@H]([C@H](O[C@H]3O[C@H]4[C@H]([C@@H]([C@H](O[C@@H]4O[C@H]5[C@H]([C@@H]([C@H](OC5O)CO)O)O)CO)O)O)CO)O)O)CO)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28214',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a D-gluconate adduct of indefinite composition containing between 30 and 34% of antimony(V), calculated with reference to dried and methanol-free substance. It is used as a treatment for leishmaniasis. It has a role as an antineoplastic agent and an antileishmanial agent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@@H]1[C@H]2[C@@H](O[Sb](O2)(O1)O[Sb]34O[C@@H]([C@H](O3)[C@@H](CO)O)[C@@H](O4)C(=O)[O-])C(=O)[O-])O)O.O.O.O.O.O.O.O.O.O.O.[OH-].[Na+].[Na+].[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19043',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an ammonium ion resulting from the protonation of two most basic nitrogens of pramipexole. 
It is a conjugate acid of a pramipexole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC[NH2+][C@H]1CCC2=C(C1)SC(=[NH+]2)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27797',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a dihydroxy monocarboxylic acid that is 17-hydroxymargaric acid (17-hydroxyheptadecanoic acid) in which the pro-R hydrogen beta to the carboxy group is replaced by a hydroxy group. It is a 3-hydroxy carboxylic acid, an omega-hydroxy fatty acid, a dihydroxy monocarboxylic acid and a long-chain fatty acid. It derives from a 17-hydroxymargaric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCCCO)CCCCCC[C@H](CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21988',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organosulfonate oxoanion that is the conjugate base of sulfoacetaldehyde; major species at pH 7.3. It is a conjugate base of a sulfoacetaldehyde.\\nThe corresponding SMILES representation is:\\nC(C=O)S(=O)(=O)[O-]\\nThe natural language question is: The molecule is a thiazolidinemonocarboxylic acid having the carboxy group at the 4-position and four additional methyl substituents at positions 2, 2, 5 and 5. It has a role as an allergen.\\nThe corresponding SMILES representation is:\\nCC1([C@@H](NC(S1)(C)C)C(=O)O)C\\nThe natural language question is: The molecule is an alpha-amino acid that is glycine substituted at the alpha-position by a 2-hydroxyethyl group. It has a role as a metabolite. 
It is a conjugate acid of a homoserinate.\\nThe corresponding SMILES representation is:\\nC(CO)C(C(=O)O)N\\nThe natural language question is: The molecule is an alkylglucosinolate that is the conjugate base of butylglucosinolic acid. It is a conjugate base of a butylglucosinolic acid.\\nThe corresponding SMILES representation is:\\nCCCC/C(=N/OS(=O)(=O)[O-])/S[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a sulfonium compound that is mercaptoethanol bearing two S-methyl substituents. It is a sulfonium compound, an organic cation and a primary alcohol. It derives from a mercaptoethanol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[S+](C)CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17058',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a stilbenoid isolated from the stems of Kobresia nepalensis and has been shown to exhibit inhibitory activity against topoisomerase II. It has a role as a metabolite and an EC 5.99.1.3 [DNA topoisomerase (ATP-hydrolysing)] inhibitor. It is a stilbenoid, a polyphenol and a member of 1-benzofurans.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1[C@@H]2[C@H]([C@H]([C@H](O2)C3=CC=C(C=C3)O)C4=CC5=C(C=C4O)O[C@H]([C@@H]5C6=CC(=CC(=C6)O)O)C7=CC=C(C=C7)O)C8=CC(=CC(=C8)O)O)O\\nThe natural language question is: The molecule is a 1,1-bis(phosphonic acid) consisting of methane substituted by two phosphonic acid groups. It has a role as a bone density conservation agent and a chelator.\\nThe corresponding SMILES representation is:\\nC(P(=O)(O)O)P(=O)(O)O\\nThe natural language question is: The molecule is an ergostanoid that is (22E)-ergosta-7,22-diene substituted by hydroxy groups at positions 3, 5 and 6 (the 3beta,5alpha,6beta stereoisomer). 
It has been isolated from the fungus, Xylaria species. It has a role as an Aspergillus metabolite. It is a 3beta-hydroxy steroid, a 5alpha-hydroxy steroid, a 6beta-hydroxy steroid and an ergostanoid.\\nThe corresponding SMILES representation is:\\nC[C@H](/C=C/[C@H](C)C(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=C[C@H]([C@@]4([C@@]3(CC[C@@H](C4)O)C)O)O)C\\nThe natural language question is: The molecule is an organic cation that is the conjugate acid of (R)-laudanosine, obtained by protonation of the tertiary amino group; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a (R)-laudanosine.\\nThe corresponding SMILES representation is:\\nC[NH+]1CCC2=CC(=C(C=C2[C@H]1CC3=CC(=C(C=C3)OC)OC)OC)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 1-acylglycerone 3-phosphate(2-) obtained by deprotonation of the phospho groups of 1-stearoylglycerone 3-phosphate; major species at pH 7.3. It is a conjugate base of a 1-stearoylglycerone 3-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCC(=O)OCC(=O)COP(=O)([O-])[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28424',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a ciguatoxin comprising a sequence of twelve trans-fused six-, seven-, eight- and nine-membered oxacycles and a spiro-fused hydroxytetrahydrofuran ring. 
It has a role as a metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C[C@H]2[C@@H](C[C@H]3[C@H](O2)[C@H]([C@@H]([C@H]4[C@H](O3)[C@H]([C@@H]([C@]5(O4)C[C@@H](CO5)O)C)C)O)C)O[C@H]6C[C@@H]7[C@]([C@@H](C[C@@H]8[C@@H](O7)C/C=C\\\\\\\\C[C@@H]9[C@@H](O8)/C=C\\\\\\\\C[C@@H]2[C@@H](O9)C=C[C@@H]3[C@@H](O2)C[C@@H]2[C@@H](O3)[C@@H]([C@@H]3[C@@H](O2)CC=CCO3)O)O)(O[C@@H]6C1)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7117',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is the anion formed by deprotonating bromfenac at the carboxyl proton. It is a monocarboxylic acid anion, an organobromine compound, a member of benzophenones and an aromatic amino-acid anion. It is a conjugate base of a bromfenac.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1)C(=O)C2=CC=C(C=C2)Br)N)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28710',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a guanidinium ion resulting from the protonation of the imino nitrogen of 1-dodecylguanidine. The major species at pH 7.3. It is a conjugate acid of a 1-dodecylguanidine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCC[NH+]=C(N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17566',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a zwitterion that is derived from LL-2,6-diaminopimelic acid by deprotonation of both carboxylic acid groups and protonation of both amino groups. It has a role as an Escherichia coli metabolite. 
It is a tautomer of a LL-2,6-diaminopimelic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C[C@@H](C(=O)[O-])[NH3+])C[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25402',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an indolizidine alkaloid that is 3,12-didehydrogalanthan substituted by hydroxy groups at positions and 2 and a methylenedioxy group across positions 9 and 10. Isolated from Crinum asiaticum, it has been shown to exhibit antimalarial activity. It has a role as a protein synthesis inhibitor, an antimalarial, a plant metabolite and an anticoronaviral agent. It derives from a hydride of a galanthan.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CN2CC3=CC4=C(C=C3[C@H]5[C@H]2C1=C[C@@H]([C@H]5O)O)OCO4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11099',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a gallate ester obtained by formal condensation of the carboxy group of gallic acid with the (3S)-hydroxy group of (+)-epicatechin. It has a role as a metabolite. It is a catechin, a gallate ester and a polyphenol. 
It derives from a gallic acid and a (+)-epicatechin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H]([C@@H](OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16643',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a tricarboxylic acid triamide resulting from the formal condensation of the each carboxy group or benzene-1,3,5-tricarboxylic acid with the primary amino group of a molecule of 3-aminopyridine. It is a tricarboxylic acid triamide and a secondary carboxamide. It derives from a benzene-1,3,5-tricarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CN=C1)NC(=O)C2=CC(=CC(=C2)C(=O)NC3=CN=CC=C3)C(=O)NC4=CN=CC=C4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2873',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a prostaglandins A. It has a role as a human metabolite. It is a conjugate acid of a prostaglandin A2(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@@H](/C=C/[C@H]1C=CC(=O)[C@@H]1C/C=C\\\\\\\\CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_884',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a polyunsaturated fatty acyl-CoA(4-) obtained by deprotonation of phosphate and diphosphate OH groups of (2E,7Z)-tetradecadienoyl-CoA; major species at pH 7.3. It is a 4,5-saturated-trans-2-enoyl-CoA(4-), a long-chain fatty acyl-CoA(4-) and a polyunsaturated fatty acyl-CoA(4-). 
It is a conjugate base of a (2E,7Z)-hexadecadienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14201',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a maleate salt obtained by combining equimolar amounts of (R,R)-asenapine and maleic acid. It contains a (R,R)-asenapine(1+). It is an enantiomer of a (S,S)-asenapine maleate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=CC=C24.C(=C\\\\\\\\C(=O)O)\\\\\\\\C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2393',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a monoatomic monocation obtained from potassium. It has a role as a human metabolite and a cofactor. It is an alkali metal cation, an elemental potassium, a monovalent inorganic cation and a monoatomic monocation.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[K+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29065',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a quaternary ammonium ion consisting of L-histidine with a 3-(trimethylammonio)-3-carboxypropyl group at the 2-position of the the imidazole ring. 
It is a conjugate acid of a diphthinate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[N+](C)(C)[C@@H](CCC1=NC=C(N1)C[C@@H](C(=O)O)N)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15513',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of xanthones that is 2,3,6,8-tetrahydroxyxanthone substituted by a 2-hydroxy-3-methylbut-3-enyl group at position 1. Isolated from the aerial parts of Hypericum scabrum, it exhibits cytotoxicity for human tumour cells. It has a role as a metabolite and an antineoplastic agent. It is a member of xanthones, a polyphenol and a secondary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=C)C(CC1=C(C(=CC2=C1C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3961',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a linear tetrasaccharide consisting of three L-glycero-alpha-D-manno-heptosyl residues and a 3-deoxy-alpha-D-manno-oct-2-ulosonic acid (Kdo) residue linked sequentially (1->2), (1->2) and (1->5); corresponds to the tetrasaccharide epitope from Haemophilus influenzae MAHI 3. 
It has a role as an epitope.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@H]([C@H](O[C@]1(C(=O)O)O)[C@@H](CO)O)O[C@@H]2[C@H]([C@H]([C@@H]([C@H](O2)[C@H](CO)O)O)O[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)[C@H](CO)O)O)O)O[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)[C@H](CO)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20044',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a dioxo monocarboxylic acid that is caprylic acid with the two oxo groups at the 4- and 7-positions. It has a role as a metabolite. It derives from an octanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)CCC(=O)CCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5018',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH group of trans,octacis-decaprenylphospho-beta-D-ribofuranose; major species at pH 7.3. It is a conjugate base of a trans,octacis-decaprenylphospho-beta-D-ribofuranose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])O[C@H]1[C@@H]([C@@H]([C@H](O1)CO)O)O)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19667',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an 3-oxo-Delta(1),Delta(4)-steroid substituted by an oxo group at position 3 and a beta-hydroxy group at position 17. It is an anabolic androgenic steroid that has been developed for veterinary use. 
It is a 17beta-hydroxy steroid, an anabolic androgenic steroid and a 3-oxo-Delta(1),Delta(4)-steroid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=CC(=O)C=C[C@]34C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20066',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of tetrahydrothiophenes that is tetrahydrothiophene in which the sulfur has been oxidised to give the corresponding sulfone. A colourless, high-boiling (285℃) liquid that is miscible with both water and hydrocarbons, it is used as an industrial solvent, particularly for the purification of hydrocarbon mixtures by liquid-vapour extraction. It has a role as a polar aprotic solvent. It is a sulfone and a member of tetrahydrothiophenes. It derives from a hydride of a tetrahydrothiophene.\\nThe corresponding SMILES representation is:\\nC1CCS(=O)(=O)C1\\nThe natural language question is: The molecule is the carbohydrate acid derivative anion formed by loss of a proton from the carboxy group of N-acetyl-D-muramoyl-L-alanine; principal microspecies at pH 7.3. It is a hydroxy monocarboxylic acid anion and a carbohydrate acid derivative anion. It is a conjugate base of a N-acetyl-D-muramoyl-L-alanine.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)[O-])NC(=O)[C@@H](C)O[C@H]1[C@@H]([C@H](OC([C@@H]1NC(=O)C)O)CO)O\\nThe natural language question is: The molecule is the O-acetyl derivative of L-homoserine. It has a role as a Saccharomyces cerevisiae metabolite. It is an O-acetylhomoserine and an acetyl-amino acid. It derives from a L-homoserine. It is an enantiomer of an O-acetyl-D-homoserine. 
It is a tautomer of an O-acetyl-L-homoserine zwitterion.\\nThe corresponding SMILES representation is:\\nCC(=O)OCC[C@@H](C(=O)O)N\\nThe natural language question is: The molecule is an inorganic sodium salt having tungstate as the counterion. Combines with hydrogen peroxide for the oxidation of secondary amines to nitrones. It has a role as a reagent. It contains a tungstate.\\nThe corresponding SMILES representation is:\\n[O-][W](=O)(=O)[O-].[Na+].[Na+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a stilbenoid that is the (2R,3R)-cis-stereoisomer of delta-viniferin, obtained by cyclodimerisation of cis-resveratrol. It is a member of 1-benzofurans, a polyphenol and a stilbenoid. It derives from a cis-resveratrol. It is an enantiomer of a (2S,3S)-cis-delta-viniferin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C\\\\\\\\C2=CC3=C(C=C2)O[C@H]([C@@H]3C4=CC(=CC(=C4)O)O)C5=CC=C(C=C5)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17023',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 3-(4-hydroxy-3-methoxyphenyl)-3-oxopropanoic acid (vanilloylacetic acid). It is a potential intermediate in vanillate synthesis. It derives from a vanilloylacetic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)CC(=O)C4=CC(=C(C=C4)O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9595',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a heme. 
It has a role as an Escherichia coli metabolite and a cofactor. It is a conjugate acid of a siroheme(8-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]1([C@@H](C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=N5)C=C1[N-]2)CC(=O)O)CCC(=O)O)CCC(=O)O)CC(=O)O)[C@H]([C@]3(C)CC(=O)O)CCC(=O)O)CCC(=O)O)CC(=O)O.[Fe]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20261',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is conjugate base of kaempferide arising from selective deprotonation of the 3-hydroxy group. It is a conjugate base of a kaempferide.\\nThe corresponding SMILES representation is:\\nCOC1=CC=C(C=C1)C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)[O-]\\nThe natural language question is: The molecule is a dioxo monocarboxylic acid consisting of 6-phenylhexanoic acid having the two oxo groups at the 2- and 6-positions. It derives from a hexanoic acid. It is a conjugate acid of a 2,6-dioxo-6-phenylhexanoate.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)C(=O)CCCC(=O)C(=O)O\\nThe natural language question is: The molecule is a primary fatty amide resulting from the formal condensation of the carboxy group of (15Z)-tetracosenoic acid with ammonia. It derives from a (15Z)-tetracosenoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCCCCCCCC(=O)N\\nThe natural language question is: The molecule is a 2-monolysocardiolipinin which the remaining phosphatidyl acyl groups at positions 1 and 1' are specified as linoleoyl while that at position 2' is specified as oleoyl. It derives from a linoleic acid and an oleic acid. 
It is a conjugate acid of a 1,1'-dilinoleoyl-2-oleoyl monolysocardiolipin(2-).\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)O[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)COP(=O)(O)OCC(COP(=O)(O)OC[C@@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of imidazoles that is 1-(2,4-dichlorophenyl)-2-(imidazol-1-yl)ethanol in which the hydroxyl hydrogen is replaced by a 2,4-dichlorobenzyl group. It is an ether, a member of imidazoles and a dichlorobenzene.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=C(C=C1Cl)Cl)COC(CN2C=CN=C2)C3=C(C=C(C=C3)Cl)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23777',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a pentasaccharide derivative composed of a 4-O-methyl-beta-D-glucuronic acid (at the non-reducing end) and four beta-D-galactose units all joined via (1->6)-linkages.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CO[C@H]1[C@@H]([C@H]([C@@H](O[C@@H]1C(=O)O)OC[C@@H]2[C@@H]([C@@H]([C@H]([C@@H](O2)OC[C@@H]3[C@@H]([C@@H]([C@H]([C@@H](O3)OC[C@@H]4[C@@H]([C@@H]([C@H]([C@@H](O4)OC[C@@H]5[C@@H]([C@@H]([C@H](C(O5)O)O)O)O)O)O)O)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26635',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an amino trisaccharide consisting of beta-L-rhamnose at the reducing end having a 4,6-O-[(1S)-1-carboxyethylidene]-N-acetyl-beta-D-glucosaminyl-(1->3)-alpha-D-galactosyl group attached at the 4-position. 
It is an amino trisaccharide, a carbohydrate derivative and a cyclic ketal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)O)O)O)O[C@@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@H]3[C@@H]([C@H]([C@@H]4[C@H](O3)CO[C@](O4)(C)C(=O)O)O)NC(=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4397',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a organic cation obtained by protonation of the two tertiary amino functions of vanoxerine It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a vanoxerine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[NH+](CC[NH+]1CCCC2=CC=CC=C2)CCOC(C3=CC=C(C=C3)F)C4=CC=C(C=C4)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19196',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydrochloride obtained by combining amiloride with one molar equivalent of hydrochloric acid. It has a role as a diuretic and a sodium channel blocker. It contains an amiloride(1+).\\nThe corresponding SMILES representation is:\\nC1(=C(N=C(C(=N1)Cl)N)N)C(=O)N=C(N)N.Cl\\nThe natural language question is: The molecule is an alkyl diphosphate having bromohydrin as the alkyl group. It has a role as a phosphoantigen. It is an alkyl diphosphate and an organobromine compound.\\nThe corresponding SMILES representation is:\\nCC(CCOP(=O)(O)OP(=O)(O)O)(CBr)O\\nThe natural language question is: The molecule is a 3',5'-cyclic purine nucleotide that is 3',5'-cyclic GMP in which the hydrogen at position 2 on the purine fragment is replaced by a 6-aminohexylthio group. It is a 3',5'-cyclic purine nucleotide, a ribonucleotide, an aryl sulfide and a primary amino compound. 
It derives from a 3',5'-cyclic GMP.\\nThe corresponding SMILES representation is:\\nC1[C@@H]2[C@H]([C@H]([C@@H](O2)N3C4=C(C(=O)NC(=N4)N)N=C3SCCCCCCN)O)OP(=O)(O1)O\\nThe natural language question is: The molecule is a tetrahydroisoquinoline alkaloid obtained from a Caribbean tunicate Ecteinascidia turbinata. Used for the treatment of soft tissue sarcoma and relapsed ovarian cancer. It has a role as an antineoplastic agent, a marine metabolite, an anti-inflammatory agent and an angiogenesis modulating agent. It is an organic heteropolycyclic compound, an azaspiro compound, an oxaspiro compound, a bridged compound, a lactone, a polyphenol, an acetate ester, a hemiaminal, an organic sulfide, a tertiary amino compound and an isoquinoline alkaloid.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C([C@@H]3[C@@H]4[C@H]5C6=C(C(=C7C(=C6[C@@H](N4[C@H]([C@H](C2)N3C)O)COC(=O)[C@@]8(CS5)C9=CC(=C(C=C9CCN8)O)OC)OCO7)C)OC(=O)C)C(=C1OC)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is this compound belongs to the class of organic compounds known as beta hydroxy acids and derivatives. These are compounds containing a carboxylic acid substituted with a hydroxyl group on the C3 carbon atom. It is a 3-hydroxy carboxylic acid and a carboxylic ester.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC(=O)C(CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9421',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a hexacyclic triterpenoid with formula C30H46O5, originally isolated from Tripterygium hypoglaucum. It has a role as a plant metabolite. 
It is a hexacyclic triterpenoid, a gamma-lactone, a monocarboxylic acid and an organic heterohexacyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C(=O)O[C@H]2[C@@]13CC[C@H]4[C@]([C@@H]3CCO2)(CC[C@@]5([C@@]4(CC[C@@]6([C@H]5C[C@](CC6)(C)C(=O)O)C)C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29511',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an amino disaccharide consisting of alpha-L-threo-hex-4-enopyranuronic acid and 2-acetaido-2-deoxy-4,6-di-O-sulfo-beta-D-galactopyranose joined in sequence by a (1->3) glycosidic bond. It is an alpha,beta-unsaturated monocarboxylic acid, an oligosaccharide sulfate, an enol, a member of acetamides and an amino disaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@H]1O)COS(=O)(=O)O)OS(=O)(=O)O)O[C@H]2[C@@H]([C@H](C(=C(O2)C(=O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23204',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an alkylglycerol that is 1-palmityl-2-methyl-sn-glycerol carrying an additional oleoyl substituent at position 3. It is an alkylglycerol and a monoacylglycerol. It derives from an oleic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCOC[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10946',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a linear amino trisaccharide consisting of alpha-KDN, beta-D-galactose and N-acetyl-beta-D-glucosamine residues linked sequentially (2->3) and (1->3). It has a role as an epitope. 
It is an amino trisaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@@]3(C[C@@H]([C@H]([C@@H](O3)[C@@H]([C@@H](CO)O)O)O)O)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15273',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a steroid acid, the 4-carboxy-4-methyl derivative of zymosterol. It is a 3beta-hydroxy steroid, a steroid acid and a monocarboxylic acid. It derives from a zymosterol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC=C(C)C)[C@H]1CC[C@@H]2[C@@]1(CCC3=C2CC[C@@H]4[C@@]3(CC[C@@H](C4(C)C(=O)O)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22196',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a lipopeptide that is an analogue of human GLP-1 in which the lysine residue at position 27 is replaced by arginine and a hexadecanoyl group attached to the remaining lysine via a glutamic acid spacer. Used as an adjunct to diet and exercise to improve glycemic control in adults with type 2 diabetes mellitus. It has a role as a glucagon-like peptide-1 receptor agonist and a neuroprotective agent. 
It is a lipopeptide and a polypeptide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)N[C@@H](CCC(=O)NCCCC[C@@H](C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H]([C@@H](C)CC)C(=O)N[C@@H](C)C(=O)N[C@@H](CC2=CNC3=CC=CC=C32)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)NCC(=O)N[C@@H](CCCNC(=N)N)C(=O)NCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)N)NC(=O)CNC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](CC5=CC=CC=C5)NC(=O)[C@H]([C@@H](C)O)NC(=O)CNC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](C)NC(=O)[C@H](CC6=CN=CN6)N)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9965',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an aci-nitro compound resulting from the tautomerisation of the nitro group of 3-nitropropanoic acid. It is a conjugate acid of a 3-(dioxido-lambda(5)-azanylidene)propanoate(2-) and a 3-aci-nitropropanoate. It is a tautomer of a 3-nitropropanoic acid.\\nThe corresponding SMILES representation is:\\nC(/C=[N+](/O)\\\\\\\\[O-])C(=O)O\\nThe natural language question is: The molecule is a tricyclic sesquiterpene, a constituent of the leaf oil cubebene obtained from a variety of species of flowering plant. It has a role as a plant metabolite. It is a sesquiterpene and a carbotricyclic compound.\\nThe corresponding SMILES representation is:\\nC[C@@H]1CC[C@H]([C@H]2[C@]13[C@@H]2C(=C)CC3)C(C)C\\nThe natural language question is: The molecule is a carboxylic ester obtained by formal condensation of cis-3-(2-chloro-3,3,3-trifluoroprop-1-enyl)-2,2-dimethylcyclopropanecarboxylic acid and [(2-methyl-1,1'-biphenyl)-3-yl]methanol. 
It has a role as a pyrethroid ester insecticide and a pyrethroid ester acaricide. It is an organochlorine compound, an organofluorine compound and a cyclopropanecarboxylate ester. It derives from a cis-chrysanthemic acid.\\nThe corresponding SMILES representation is:\\nCC1=C(C=CC=C1C2=CC=CC=C2)COC(=O)[C@@H]3[C@@H](C3(C)C)/C=C(/C(F)(F)F)\\\\\\\\Cl\\nThe natural language question is: The molecule is a 3-hydroxy fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (R)-3-hydroxybehenic acid. It is a (R)-3-hydroxyacyl-CoA, a 3-hydroxy fatty acyl-CoA, a long-chain fatty acyl-CoA and an 11,12-saturated fatty acyl-CoA. It is a conjugate acid of a (R)-3-hydroxydocosanoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCC[C@H](CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a monocarboxylic acid that is butyric acid in which one of the hydrogens at position 4 is replaced by a 2,4-dichlorophenoxy group. A selective post-emergence herbicide. It has a role as an agrochemical, a synthetic auxin and a phenoxy herbicide. It is an organochlorine compound, a monocarboxylic acid and an aromatic ether. It is a conjugate acid of a 4-(2,4-dichlorophenoxy)butanoate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=C(C=C1Cl)Cl)OCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2008',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a dipeptide obtained by formal condensation of the carboxy group of L-ornithine with the amino group of D-glutamic acid. It is a constituent of bacterial peptidoglycan type A4beta. 
It derives from a L-ornithine and a D-glutamic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C[C@@H](C(=O)N[C@H](CCC(=O)O)C(=O)O)N)CN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8270',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a dTDP-sugar having 4-dehydro-6-deoxy-alpha-D-glucose as the sugar component. It is an intermediate in dTDP-rhamnose biosynthesis. It has a role as an Escherichia coli metabolite and a mouse metabolite. It derives from a dTDP-D-glucose. It is a conjugate acid of a dTDP-4-dehydro-6-deoxy-alpha-D-glucose(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C(=O)[C@@H]([C@H]([C@H](O1)OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4302',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is dianion of 4-(4-deoxy-beta-D-gluc-4-enosyluronic acid)-D-galacturonic acid arising from deprotonation of both carboxy groups. It is a carbohydrate acid anion and a dicarboxylic acid dianion. It is a conjugate base of a 4-(4-deoxy-beta-D-gluc-4-enosyluronic acid)-D-galacturonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(O[C@H]([C@@H]([C@H]1O)O)O[C@@H]2[C@@H]([C@H](C(O[C@@H]2C(=O)[O-])O)O)O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12015',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 7-[3-(3-cyclohexyl-3-hydroxypropyl)-2,5-dioxoimidazolidin-4-yl]heptanoic acid that is the (3R,4S)-enantiomer of BW 245C. 
It is an enantiomer of a (3S,4R)-BW 245C.\\nThe corresponding SMILES representation is:\\nC1CCC(CC1)[C@@H](CCN2[C@H](C(=O)NC2=O)CCCCCCC(=O)O)O\\nThe natural language question is: The molecule is a 3-coumarate that is the conjugate base of trans-3-coumaric acid. It has a role as a human xenobiotic metabolite and a plant metabolite. It is a conjugate base of a trans-3-coumaric acid.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC(=C1)[O-])/C=C/C(=O)O\\nThe natural language question is: The molecule is a branched amino octasaccharide comprised of a sequence of alpha-D-mannose, beta-D-mannose, N-acetyl-beta-D-glucosamine and N-acetyl-D-glucosamine residues linked (1->3), (1->4) and (1->4), to the beta-D-mannose of which is (1->6)-linked a tetrasaccharide branch comprising N-acetyl-alpha-neuraminose, beta-D-galactose, N-acetyl-beta-D-glucosamine and alpha-D-mannose residues linked sequentially (2->6), (1->4) and (1->2). It has a role as an epitope. It is an amino octasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H](C[C@@](O[C@H]1[C@@H]([C@@H](CO)O)O)(C(=O)O)OC[C@@H]2[C@@H]([C@@H]([C@H]([C@@H](O2)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O)NC(=O)C)O[C@H]4[C@H]([C@@H]([C@H](O[C@@H]4OC[C@@H]5[C@H]([C@@H]([C@@H]([C@@H](O5)O[C@@H]6[C@H](O[C@H]([C@@H]([C@H]6O)NC(=O)C)O[C@@H]7[C@H](OC([C@@H]([C@H]7O)NC(=O)C)O)CO)CO)O)O[C@@H]8[C@H]([C@H]([C@@H]([C@H](O8)CO)O)O)O)O)CO)O)O)CO)O)O)O)O\\nThe natural language question is: The molecule is an acyl-CoA(4-) that is the tetraanion of benzoyl-CoA, arising from deprotonation of phosphate and diphosphate functions. 
It is a conjugate base of a benzoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)C4=CC=CC=C4)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 9,10-epoxyoctadecanoate that is the conjugate base of (9S,10R)-epoxyoctadecanoic acid arising from deprotonation of the carboxylic acid function; major species at pH 7.3. It is a conjugate base of a (9S,10R)-epoxyoctadecanoic acid. It is an enantiomer of a (9R,10S)-9,10-epoxyoctadecanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC[C@@H]1[C@@H](O1)CCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10326',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an L-polyhomomethionine zwitterion obtained by transfer of a proton from the carboxy to the amino group of L-trihomomethionine; major species at pH 7.3. It is a trihomomethionine zwitterion and a L-polyhomomethionine zwitterion. It is a tautomer of a L-trihomomethionine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CSCCCCC[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9125',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 1-(Z)-alk-1-enyl-2-acyl-sn-glycero-3-phosphoethanolamine zwitterion in which the alk-1-enyl and acyl groups are specified as (1Z)-octadecenyl and (4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl respectively. It is a 1-(Z)-alk-1-enyl-2-acyl-sn-glycero-3-phosphoethanolamine zwitterion and a 1-O-(1Z-octadecenyl)-2-acyl-sn-glycero-3-phosphoethanolamine zwitterion. 
It is a tautomer of a 1-(1Z-octadecenyl)-2-(4Z,7Z,10Z,13Z,16Z,19Z-docosahexaenoyl)-sn-glycero-3-phosphoethanolamine.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC/C=C\\\\\\\\OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC\\nThe natural language question is: The molecule is an oligosaccharide derivative that is a undecasaccharide derivative, the oligosaccharide portion of the Proteus penneri strain 13 lipopolysaccharide (LPS) core region. Structural variations can occur: the alpha-LD-Hep substituent linked (1->7) to alpha-LD-Hep6PEtn may be further substituted (1->7) by a beta-D-GalAN residue; if so, then the [alpha-LD-Hep-(1->2)-alpha-DD-Hep branch linked (1->2) to alpha-D-GalA may be absent.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@H]([C@H](O[C@]1(C(=O)O)O[C@@H]2C[C@@](O[C@@H]([C@@H]2O[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)[C@H](CO)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)[C@H](CO[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)[C@H](CO)O)O)O)O)OP(=O)(O)OCCN)O)O[C@@H]7[C@@H]([C@H]([C@H]([C@H](O7)C(=O)O)O[C@@H]8[C@@H]([C@H]([C@H]([C@H](O8)CO)O)O)N)O)O[C@@H]9[C@H]([C@H]([C@@H]([C@H](O9)[C@@H](CO)O)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@H](CO)O)O)O)O)O)O)[C@@H](CO[C@@H]1[C@@H]([C@H]([C@H](CO1)N)O)O)O)(C(=O)O)O)[C@@H](CO)O)O)O\\nThe natural language question is: The molecule is an organic thiophosphate that is the S-phospho derivative of 2-[(3-aminopropyl)amino]ethanethiol. A prodrug for the free thiol, WR-1065, which is used as a cytoprotectant in cancer chemotherapy and radiotherapy. It has a role as a prodrug, a radiation protective agent and an antioxidant. It is a diamine and an organic thiophosphate. 
It derives from a cysteamine.\\nThe corresponding SMILES representation is:\\nC(CN)CNCCSP(=O)(O)O\\nThe natural language question is: The molecule is a nucleoside 5'-diphosphate(3-) arising from deprotonation of all three OH groups of the diphosphate function of of inosine 5'-diphosphate (IDP); major species at pH 7.3. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of an IDP.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(C(=O)N1)N=CN2[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])[O-])O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a nucleotide-sugar oxoanion arising from deprotonation of the diphosphate OH groups and protonation of the amino group of dTDP-4-amino-2,3,4,6-tetradeoxy-alpha-D-glucose; major species at pH 7.3. It is a conjugate base of a dTDP-4-amino-2,3,4,6-tetradeoxy-alpha-D-glucose.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C[C@@H]1[C@H](CC[C@H](O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22977',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a sesquiterpenoid that consists of (3S,4R,4aR,6S,6aS,12R,12aS,12bS)-4-(acetoxymethyl)-12-hydroxy-4,6a,12b-trimethyl-11-oxo-9-(pyridin-3-yl)-1,3,4,4a,5,6,6a,12,12a,12b-decahydro-2H,11H-benzo[f]pyrano[4,3-b]chromene-3,6-diol in which the hydrogens of the 3- and 6-hydroxy functions are substituted by acetyl groups. It has a role as a metabolite and an acyl-CoA:cholesterol acyltransferase 2 inhibitor. 
It is a sesquiterpenoid and an organic heterotetracyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)OC[C@@]1([C@H](CC[C@]2([C@H]1C[C@@H]([C@@]3([C@@H]2[C@H](C4=C(O3)C=C(OC4=O)C5=CN=CC=C5)O)C)OC(=O)C)C)OC(=O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23927',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a triacylglycerol 52:3 in which the acyl groups at positions 1, 2 and 3 are specified as palmitoyl, linoleoyl and oleoyl respectively. It has a role as a Caenorhabditis elegans metabolite and a mouse metabolite. It is a triacyl-sn-glycerol, a triacylglycerol 52:3 and a linoleoyl containing 1,2,3-triacyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OC[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nThe natural language question is: The molecule is a 9,10-epoxyoctadecanoate that is the conjugate base of (9S,10R)-epoxyoctadecanoic acid arising from deprotonation of the carboxylic acid function; major species at pH 7.3. It is a conjugate base of a (9S,10R)-epoxyoctadecanoic acid. It is an enantiomer of a (9R,10S)-9,10-epoxyoctadecanoate.\\nThe corresponding SMILES representation is:\\nCCCCCCCC[C@@H]1[C@@H](O1)CCCCCCCC(=O)[O-]\\nThe natural language question is: The molecule is an octanoate ester obtained by the formal condensation of the carboxy group of octanoic acid with the hydroxy group of butanol. It has a role as a metabolite. It derives from a butan-1-ol.\\nThe corresponding SMILES representation is:\\nCCCCCCCC(=O)OCCCC\\nThe natural language question is: The molecule is an (omega-1)-hydroxy fatty acid ascaroside obtained by formal condensation of the alcoholic hydroxy group of (2E,6R)-6-hydroxyhept-2-enoic acid with ascarylopyranose (the alpha anomer). 
A metabolite of the nematode Caenorhabditis elegans, it is weakly dauer inducing and a weak male-attractant. It has a role as a Caenorhabditis elegans metabolite and a pheromone. It is an alpha,beta-unsaturated monocarboxylic acid and an (omega-1)-hydroxy fatty acid ascaroside. It derives from a (2E,6R)-6-hydroxyhept-2-enoic acid. It is a conjugate acid of an ascr#7(1-).\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CC/C=C/C(=O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydrate composed of cadmium sulfate and water in a 3:8 ratio. It is a hydrate and a cadmium salt. It contains a cadmium sulfate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'O.O.O.O.O.O.O.O.[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Cd+2].[Cd+2].[Cd+2]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17896',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is the carbohydrate acid derivative anion formed from beta-D-GlcA3S-(1->3)-beta-D-Gal-OC6H4-4-[CH2]2NHC(O)[CH2]3SH by loss of two protons, one from each of its sulfo and carboxy groups. It is a conjugate base of a beta-D-GlcA3S-(1->3)-beta-D-Gal-OC6H4-4-[CH2]2NHC(O)[CH2]3SH.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1CCNC(=O)CCCS)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)[O-])O)OS(=O)(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9279',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a cyclic dipeptide that is brevianamide F (cyclo-L-Trp-L-Pro) substituted at position 2 on the indole ring by a 1,1-dimethylallyl group. 
It is a dipeptide, a member of indoles, a pyrrolopyrazine and an indole alkaloid. It derives from a brevianamide F.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C=C)C1=C(C2=CC=CC=C2N1)C[C@H]3C(=O)N4CCC[C@H]4C(=O)N3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28859',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a resin glycoside that is the pentasaccharide derivative of jalapinolic acid. Isolated from the aerial parts of Ipomoea pes-caprae, it has been found to exhibit potential inhibitory effect against multidrug resistance in the human breast cancer cell line. It has a role as a metabolite. It is a cinnamate ester, a macrocyclic lactone, a pentasaccharide derivative, a resin glycoside and a decanoate ester. It derives from a (S)-2-methylbutyric acid, a trans-cinnamic acid and a jalapinolic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCC(=O)O[C@@H]1[C@@H]([C@H]([C@@H](O[C@H]1O[C@H]2[C@@H](O[C@@H]3[C@@H]([C@@H]2OC(=O)CCCCCCCCC[C@@H](O[C@H]4[C@H](O3)[C@H]([C@H]([C@H](O4)C)O)O)CCCCC)O)C)C)O[C@H]5[C@@H]([C@@H]([C@H]([C@@H](O5)C)OC(=O)[C@@H](C)CC)OC(=O)/C=C/C6=CC=CC=C6)O)O[C@H]7[C@@H]([C@@H]([C@H]([C@@H](O7)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14437',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an aromatic ketone in which the two substituents on the carbonyl C atom are phenyl and ethyl. 
It has a role as a fragrance.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(=O)C1=CC=CC=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20345',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organofluorine compound that is benzyl alcohol substituted by fluoro groups at positions 2, 3, 4, 5 and 6. It is a member of benzyl alcohols and an organofluorine compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C1=C(C(=C(C(=C1F)F)F)F)F)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1789',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a homoisoflavonoid that is 4H-1-benzopyran-4-one substituted by hydroxy groups at positions 5 and 7, methyl groups at positions 6 and 8 and a (2H-1,3-benzodioxol-5-yl)methyl group at position 3 respectively. It has a role as a plant metabolite. 
It is a homoisoflavonoid and a member of resorcinols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=C2C(=C1O)C(=O)C(=CO2)CC3=CC4=C(C=C3)OCO4)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26138',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a glycosylmannose that is beta-D-mannopyranose in which the hydroxy group at position 3 has been converted into the corresponding alpha-D-mannopyranoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@@H](O1)O)O)O[C@@H]2[C@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13525',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 2-acyl-sn-glycero-3-phosphoethanolamine in which the acyl group is specified as (11Z,14Z)-icosadienoyl. It has a role as a mouse metabolite. It is a 2-acyl-sn-glycero-3-phosphoethanolamine and a lysophosphatidylethanolamine 20:2. It derives from an (11Z,14Z)-icosadienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCC(=O)O[C@H](CO)COP(=O)(O)OCCN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22123',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a pseudoguaianolide with anti-inflammatory activity isolated from the aerial parts of Inula hupehensis. It has a role as an anti-inflammatory agent and a plant metabolite. 
It is a gamma-lactone, an acetate ester, a cyclic ketone, an ether, an organic heterotricyclic compound and a pseudoguaianolide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C[C@H]2[C@H]([C@@H]([C@]3([C@H]1[C@H](CC3=O)OC)C)OC(=O)C)C(=C)C(=O)O2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26597',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a macromolecule consisting of 10-formyltetrahydrofolic acid with an arbitrary number of glutamate residues attached as a polypeptide to the single existent one. It derives from a 10-formyltetrahydrofolic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H](NC2=C(N1)N=C(NC2=O)N)CN(C=O)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)N[C@@H](CCC(=O)N[C@@H](CCC(=O)O)C(=O)O)C(=O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14858',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a HETE anion that is the conjugate base of 7-HETE, obtained by deprotonation of the carboxy group; major species at pH 7.3. It derives from an arachidonate. It is a conjugate base of a 7-HETE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C(/C=C\\\\\\\\CCCC(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12432',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a aldonolactone phosphate that is L-arabino-1,4-lactone carrying a single phospho substituent at position 5. It is a gamma-lactone and an aldonolactone phosphate. It derives from a L-arabinono-1,4-lactone. 
It is a conjugate acid of a L-arabino-1,4-lactone-5-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]1[C@@H]([C@H](C(=O)O1)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22713',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a fatty acid methyl ester resulting from the formal condensation of the carboxy group of 12-HPETE with methanol. It is a fatty acid methyl ester and a lipid hydroperoxide. It derives from an icosa-5,9,11,14-tetraenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C(C/C=C\\\\\\\\CCCC(=O)OC)OO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15995',\n", + " 'prompt': \"Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a steroid sulfate oxoanion obtained by deprotonation of the sulfo group of 17beta-estradiol 3-sulfate; major species at pH 7.3. It is a conjugate base of a 17beta-estradiol 3-sulfate.\\nThe corresponding SMILES representation is:\\nC[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=C3C=CC(=C4)OS(=O)(=O)[O-]\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of 3-phenylpropionic acid, obtained by deprotonation of the carboxy group. It has a role as a human metabolite and a plant metabolite. It is a conjugate base of a 3-phenylpropionic acid.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)CCC(=O)[O-]\\nThe natural language question is: The molecule is a docosanoid that is (4Z,8E,10Z,13Z,15E,19Z)-docosahexaenoic acid carrying two hydroperoxy substituents at the 7S- and 17S-positions. It has a role as a human xenobiotic metabolite. 
It is a docosanoid, a hydroperoxy fatty acid, a lipid hydroperoxide and a long-chain fatty acid. It derives from an all-cis-docosa-4,7,10,13,16,19-hexaenoic acid. It is a conjugate acid of a (7S,17S)-bis(hydroperoxy)-(4Z,8E,10Z,13Z,15E,19Z)-docosahexaenoate.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C[C@@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\[C@H](C/C=C\\\\\\\\CCC(=O)O)OO)OO\\nThe natural language question is: The molecule is a peptide anion arising from deprotonation of the three carboxy groups and protonation of the primary amino group of D-gamma-glutamyl-D-glutamic acid. It is a conjugate base of a D-gamma-glutamyl-D-glutamic acid.\\nThe corresponding SMILES representation is:\\nC(CC(=O)N[C@H](CCC(=O)[O-])C(=O)[O-])[C@H](C(=O)[O-])[NH3+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a xanthene dye that is fluorescein bearing bromine substituents at positions 2', 4', 5' and 7' (on the xanthene ring) and chlorine substituents at position 2, 3, 4, and 5 (on the phenyl ring). The disodium salt is the biological stain 'phloxine B'. It has a role as a fluorochrome. It is an organobromine compound, a member of benzoic acids, a tetrachlorobenzene, a member of phenols and a xanthene dye. It derives from a fluorescein. It is a conjugate acid of a 2',4',5',7'-tetrabromo-2,3,4,5-tetrachlorofluorescein(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=C2C(=C3C=C(C(=O)C(=C3OC2=C(C(=C1Br)O)Br)Br)Br)C4=C(C(=C(C(=C4Cl)Cl)Cl)Cl)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16005',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the indol-3-yl carboxylic acid anion formed by loss of a proton from the carboxy group of (5-hydroxyindol-3-yl)acetic acid; principal microspecies at pH 7.3 It has a role as a human metabolite. 
It is a conjugate base of a (5-hydroxyindol-3-yl)acetic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1O)C(=CN2)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25632',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 3beta-hydroxy steroid that is estr-4-ene substituted by a beta-hydroxy group at positions 3 and 17. It is a synthetic anabolic steroid that is used as a dietary supplement by athletes to enhance performance. It has a role as a nutraceutical and a prohormone. It is a 17beta-hydroxy steroid, an anabolic androgenic steroid, a 3beta-hydroxy steroid and a diol. It derives from a hydride of an estrane.\\nThe corresponding SMILES representation is:\\nC[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=C[C@H](CC[C@H]34)O\\nThe natural language question is: The molecule is a deoxytalose that is beta-L-talopyranose in which the hydroxy group at position 6 has been replaced by a hydrogen.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@H]([C@H](O1)O)O)O)O\\nThe natural language question is: The molecule is the L-enantiomer N-carbamoylaspartic acid. It has a role as a metabolite. It is a N-carbamoyl-L-amino acid, a N-carbamoylaspartic acid and a L-aspartic acid derivative. It is a conjugate acid of a N-carbamoyl-L-aspartate(2-).\\nThe corresponding SMILES representation is:\\nC([C@@H](C(=O)O)NC(=O)N)C(=O)O\\nThe natural language question is: The molecule is an N-acyl-15-methylhexadecasphing-4-enine-1-phosphocholine in which the acyl group has 26 carbons and 0 double bonds and is 2-hydroxylated. 
It derives from a 15-methylhexadecasphing-4-enine.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCC(C(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCCC(C)C)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a DiHETE that is (6E,8Z,11Z,13E)-icosatetraenoic acid in which the two hydroxy substituents are placed at the 5S- and 15R-positions. It has a role as a human xenobiotic metabolite. It is a conjugate acid of a 5(S),15(R)-DiHETE(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\[C@H](CCCC(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9566',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a hydroxy fatty acid anion that is the conjugate base of (9Z,12R)-12,18-dihydroxyoctadecenoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is an omega-hydroxy fatty acid anion, a long-chain fatty acid anion and a hydroxy monounsaturated fatty acid anion. It is a conjugate base of a (9Z,12R)-12,18-dihydroxyoctadecenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCC/C=C\\\\\\\\C[C@@H](CCCCCCO)O)CCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15524',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a dicarboxylic acid anion obtained by deprotonation of the two carboxy groups and the 7-hydroxy group of salvianin. 
It is a conjugate base of a salvianin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C2=C(C=C3C(=CC(=O)C=C3O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)COC(=O)CC(=O)[O-])OC(=O)CC(=O)[O-])O)O)O2)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)COC(=O)/C=C/C6=CC(=C(C=C6)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29072',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a heptasaccharide consisting of two alpha-maltotriose units linked (1->6), with a further alpha-D-glucose unit linked to O-6 of the glucose residue at the non-reducing end. It derives from an alpha-maltotriose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@H](O2)O[C@@H]3[C@H](O[C@@H]([C@@H]([C@H]3O)O)O[C@@H]4[C@H](O[C@@H]([C@@H]([C@H]4O)O)OC[C@@H]5[C@H]([C@@H]([C@H]([C@H](O5)O[C@@H]6[C@H](O[C@@H]([C@@H]([C@H]6O)O)O[C@@H]7[C@H](O[C@@H]([C@@H]([C@H]7O)O)O)CO)CO)O)O)O)CO)CO)O)O)O)O)O)O)O\\nThe natural language question is: The molecule is an ammonium ion resulting from the protonation of the non-acylated nitrogen of N(1),N(8)-bis(sinapoyl)-spermidine. The major species at pH 7.3. It has a role as a plant metabolite. It is a conjugate acid of a N(1),N(8)-bis(sinapoyl)-spermidine.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1O)OC)/C=C/C(=O)NCCCC[NH2+]CCCNC(=O)/C=C/C2=CC(=C(C(=C2)OC)O)OC\\nThe natural language question is: The molecule is a one-carbon compound that is methane in which three of the hydrogens are replaced by chlorines. It has a role as an inhalation anaesthetic, a non-polar solvent, a carcinogenic agent, a central nervous system drug and a refrigerant. 
It is a one-carbon compound and a member of chloromethanes.\\nThe corresponding SMILES representation is:\\nC(Cl)(Cl)Cl\\nThe natural language question is: The molecule is a myo-inositol monophosphate. It has a role as a mouse metabolite. It derives from a myo-inositol. It is a conjugate acid of a 1D-myo-inositol 3-phosphate(2-).\\nThe corresponding SMILES representation is:\\n[C@H]1([C@@H](C([C@H]([C@@H](C1O)O)O)OP(=O)(O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a scyllo-inositol phosphate having the phosphate at the 4-position and a guanidino group in place of the hydroxyl at position 1. It is a conjugate acid of a 1-guanidino-1-deoxy-scyllo-inositol 4-phosphate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@H]1([C@H](C([C@H]([C@@H](C1N=C(N)N)O)O)OP(=O)(O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5112',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an indole alkaloid isolated from the Amaryllidaceae family and has been shown to exhibit cytotoxic activity. It has a role as an antineoplastic agent and a metabolite. 
It is an indole alkaloid, a delta-lactone, a secondary alcohol and an organic heteropentacyclic compound.\\nThe corresponding SMILES representation is:\\nCN1CCC2=C[C@@H]([C@@H]3[C@H]([C@@H]21)C4=CC5=C(C=C4C(=O)O3)OCO5)O\\nThe natural language question is: The molecule is a tRNA oligonucleotide comprised of a sequence of inosine, three adenosine, guanosine, cytidine, 1-methylinosine and uridine residues connected by 3'->5' phosphodiester linkages and with a phosphoric residue at the 3'-terminus.\\nThe corresponding SMILES representation is:\\nCN1C=NC2=C(C1=O)N=CN2[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)O[C@@H]4[C@H](O[C@H]([C@@H]4O)N5C=CC(=NC5=O)N)COP(=O)(O)O[C@@H]6[C@H](O[C@H]([C@@H]6O)N7C=NC8=C7N=C(NC8=O)N)COP(=O)(O)O[C@@H]9[C@H](O[C@H]([C@@H]9O)N1C=NC2=C(N=CN=C21)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C(N=CN=C21)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C(N=CN=C21)N)COP(=O)(O)O[C@@H]1[C@H](O[C@H]([C@@H]1O)N1C=NC2=C1N=CN=C2O)CO)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N1C=CC(=O)NC1=O)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is an ammonium salt obtained by reaction of ammonia with acetic acid. A deliquescent white crystalline solid, it has a relatively low melting point (114℃) for a salt. Used as a food acidity regulator, although no longer approved for this purpose in the EU. It has a role as a food acidity regulator and a buffer. It is an acetate salt and an ammonium salt.\\nThe corresponding SMILES representation is:\\nCC(=O)[O-].[NH4+]\\nThe natural language question is: The molecule is a mycolic acid produced by Mycobacterium tuberculosis, the structure of which is tetracosanoic acid substituted at the alpha-carbon by a C55 chain which incorporates a hydroxy group at C-1, a cyclopropyl ring fused onto the C-16-C-17 bond, an oxo group at C-35 and a methyl group at C-36. 
It has a role as an antigen.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCC(C(CCCCCCCCCCCCCCC1CC1CCCCCCCCCCCCCCCCCC(=O)C(C)CCCCCCCCCCCCCCCCCCC)O)C(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is pyrimidine substituted at C-2 by a sulfanyl group. It has a role as a corrosion inhibitor and an allergen. It is a member of pyrimidines and an aryl thiol.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CNC(=S)N=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6466',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a branched N-glycan derivative that is an undecasaccharide derivative consisting of nine D-mannosyl residues and two N-acetylglucosamine residues (one at the reducing end). It is a N-glycan derivative, a high-mannose oligosaccharide and a polysaccharide derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@@H]2[C@H](OC([C@@H]([C@H]2O)NC(=O)C)O)CO)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)O)O[C@@H]7[C@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O[C@@H]8[C@H]([C@H]([C@@H]([C@H](O8)CO)O)O)O)O)O)O[C@@H]9[C@H]([C@H]([C@@H]([C@H](O9)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10251',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a lignan that is 2,3-diemthylbutane substituted by a 4-hydroxy-3-methoxyphenyl group at position 4 and a 4-hydroxy-3,5-dimethoxyphenyl group at position 1. 
It has been isolated from the bark of Machilus robusta. It has a role as a plant metabolite. It is a dimethoxybenzene, a member of phenols and a lignan.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](CC1=CC(=C(C=C1)O)OC)[C@H](C)CC2=CC(=C(C(=C2)OC)O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15474',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of pyrimidines that is pyrimidine which has been substituted at positions 2, 4, and 5 by (1-isopropyl-2-oxo-2,3-dihydro-1H-indol-5-yl)amino, [3-(methylsulfonyl)benzyl]amino, and trifluoromethyl groups, respectively. It is a sulfone, a secondary amino compound, a member of pyrimidines, a member of oxindoles and a member of (trifluoromethyl)benzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)N1C(=O)CC2=C1C=CC(=C2)NC3=NC=C(C(=N3)NCC4=CC(=CC=C4)S(=O)(=O)C)C(F)(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4329',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an amino disaccharide that consists of 6-sulfated N-acetyl-beta-D-glucosamine having a 3-sulfated beta-D-galactosyl residue attached at position 4. It has a role as an epitope. 
It is an oligosaccharide sulfate and an amino disaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)COS(=O)(=O)O)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)OS(=O)(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13175',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a long-chain fatty acid ethyl ester resulting from the formal condensation of the carboxy group of (4Z,7Z,10Z,13Z,16Z)-henicosapentaenoic acid with the hydroxy group of ethanol.\\nThe corresponding SMILES representation is:\\nCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)OCC\\nThe natural language question is: The molecule is a member of the class of carbazoles that is an adrenergic antagonist with non-selective beta- and alpha-1 receptor blocking properties which helps in the management of congestive heart failure. It has a role as a beta-adrenergic antagonist, an antihypertensive agent, an alpha-adrenergic antagonist, a vasodilator agent and a cardiovascular drug. It is a member of carbazoles, a secondary alcohol and a secondary amino compound.\\nThe corresponding SMILES representation is:\\nCOC1=CC=CC=C1OCCNCC(COC2=CC=CC3=C2C4=CC=CC=C4N3)O\\nThe natural language question is: The molecule is a 3-hydroxy fatty acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (3R,9Z,12Z,15Z,18Z,21Z)-3-hydroxytetracosapentaenoyl-CoA; major species at pH 7.3. It is a (R)-3-hydroxyacyl-CoA(4-) and a 3-hydroxy fatty acyl-CoA(4-). 
It is a conjugate base of a (3R,9Z,12Z,15Z,18Z,21Z)-3-hydroxytetracosapentaenoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC[C@H](CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)O\\nThe natural language question is: The molecule is a cyclodepsipeptide isolated from Jaspis splendens. A derivative of jaspamide, it has been shown to exhibit cytotoxic and microfilament disruption activity. It has a role as a metabolite, an actin polymerisation inhibitor and an antineoplastic agent. It is a cyclodepsipeptide, a macrocycle and a substituted aniline.\\nThe corresponding SMILES representation is:\\nC[C@@H]\\\\\\\\1C[C@@H](OC(=O)C[C@@H](NC(=O)[C@H](N(C(=O)[C@@H](NC(=O)[C@H](C/C(=C1)/C)C)C)C)CC(=O)C2=CC=CC=C2N)C3=CC=C(C=C3)O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a fluoroalkane that is hexadecane in which all of the hydrogen atoms at positions 1, 2, 3, 4, 5, and 6 have been replaced by fluorine atoms. It has a role as a nonionic surfactant. It is a fluorohydrocarbon and a fluoroalkane. It derives from a hydride of a hexadecane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5530',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a dicarboximide that is 3,4,5,6-tetrahydrophthalimide in which the hydrogen attached to the nitrogen is replaced by a p-bromophenyl group. 
It is a secondary amide, an organonitrogen heterocyclic compound, a member of maleimides, a member of bromobenzenes and an organic heterobicyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CCC2=C(C1)C(=O)N(C2=O)C3=CC=C(C=C3)Br'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24327',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of 2,5-diketopiperazines that is piperazine-2,5-dione in which two hydrogen at position 3 and one hydrogen at position 6 are replaced by benzylidene and isobutyl groups respectively.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C[C@H]1C(=O)N/C(=C\\\\\\\\C2=CC=CC=C2)/C(=O)N1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12265',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a steroid lactone that has been used in the treatment of preeclampsia. A synthetic bufadienolide it is an antagonist of marinobufagenin (MBG) and differs from it by replacement of its 5beta-OH and 13beta-Me groups by hydrogen. It is a steroid lactone and an epoxy steroid. It derives from a bufanolide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@@H](C[C@H]1CC[C@@H]3[C@@H]2CC[C@H]4[C@]35[C@H](O5)C[C@@H]4C6=COC(=O)C=C6)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6313',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an inorganic sodium salt and a sulfite salt. It has a role as a mutagen. 
It contains a hydrogensulfite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'OS(=O)[O-].[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27702',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a nucleoside triphosphate(4-) obtained by global deprotonation of the triphosphate OH groups of XTP; major species present at pH 7.3. It is a conjugate base of a XTP(3-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])O)O)NC(=O)NC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8850',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an amino acid zwitterion obtained by transfer of a proton from the carboxy to the amino group of tetrahydrotyrosine; major species at pH 7.3. It is a tautomer of a tetrahydrotyrosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC(C=CC1CC(C(=O)[O-])[NH3+])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25257',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a dihydroxydocosahexaenoic acid that is (4Z,7Z,11E,13Z,15E,19Z)-docosahexaenoic acid in which the two hydroxy substituents are located at positions 10 and 17 (the 10S,17S-stereoisomer). A natural isomer of protectin D1, one of the specialised proresolving mediators. It has a role as an anti-inflammatory agent and a human xenobiotic metabolite. 
It is a dihydroxydocosahexaenoic acid and a secondary allylic alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C[C@@H](/C=C/C=C\\\\\\\\C=C\\\\\\\\[C@H](C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19944',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a monocarboxylic acid that is 2-methylpropanoic acid substituted by a 4-(4-chlorobenzoyl)phenoxy group at position 2. It is a metabolite of the drug fenofibrate. It has a role as a marine xenobiotic metabolite and a drug metabolite. It is a chlorobenzophenone, a monocarboxylic acid and an aromatic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C(=O)O)OC1=CC=C(C=C1)C(=O)C2=CC=C(C=C2)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1151',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a 2-monolysocardiolipinin which the remaining phosphatidyl acyl groups at positions 1 and 1' are specified as linoleoyl while that at position 2' is specified as oleoyl. It derives from a linoleic acid and an oleic acid. 
It is a conjugate acid of a 1,1'-dilinoleoyl-2-oleoyl monolysocardiolipin(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)O[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)COP(=O)(O)OCC(COP(=O)(O)OC[C@@H](COC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27004',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an oligonucleotide comprised of three adenosine residues linked 2'->5' and with a triphosphate group at the 5' terminus. It has a role as a protein synthesis inhibitor.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])O[C@@H]4[C@@H]([C@H](O[C@H]4N5C=NC6=C(N=CN=C65)N)COP(=O)([O-])O[C@@H]7[C@@H]([C@H](O[C@H]7N8C=NC9=C(N=CN=C98)N)COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])O)O)O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19370',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a docosanoid that is (8E,10E,12Z,16Z,19Z)-docosapentaenoic acid carrying two hydroxy substituents at positions 7R and 14S. An intermediate of specialised proresolving mediators. It has a role as an anti-inflammatory agent and a human xenobiotic metabolite. It is a secondary allylic alcohol, a docosanoid, a hydroxy polyunsaturated fatty acid and a long-chain fatty acid. 
It is a conjugate acid of a (7R,14S)-dihydroxy-(8E,10E,12Z,16Z,19Z)-docosapentaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C[C@@H](/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\[C@@H](CCCCCC(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5233',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a dicarboxylic acid dianion resulting from the removal of a proton from both of the carboxy groups of heme d trans-diol. It is a conjugate base of a heme d trans-diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)[C@@]([C@]3(CCC(=O)[O-])O)(C)O)CCC(=O)[O-].[Fe]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6533',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 1-alkyl-2-acyl-sn-glycero-3-phosphserine in which the alkyl and acyl groups are specified as (1Z)-octadecenyl and oleoyl respectively. It derives from an oleic acid. It is a conjugate acid of a 1-(1Z-octadecenyl)-2-oleoyl-sn-glycero-3-phosphoserine(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC/C=C\\\\\\\\OC[C@H](COP(=O)(O)OC[C@@H](C(=O)O)N)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7354',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a hydroxybenzyl alcohol that is phenol substituted at position C-3 by a hydroxymethyl group. It has a role as a metabolite. 
It is a member of phenols and a hydroxybenzyl alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC(=C1)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4578',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an (R)-3-hydroxyacyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (R)-3-hydroxyoctanoyl-CoA. It is a (R)-3-hydroxyacyl-CoA(4-) and a 3-hydroxyoctanoyl-CoA(4-). It is a conjugate base of a (R)-3-hydroxyoctanoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@H](CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11988',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a dipeptide formed from two L-leucine residues. It has a role as a human metabolite and a Mycoplasma genitalium metabolite. It derives from a L-leucine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C[C@@H](C(=O)N[C@@H](CC(C)C)C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5366',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is the dolichyl diphosphooligosaccharide(2-) species that is the dianion formed by loss of protons from the diphospho linkage in beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc(PP-Dol); major microspecies at pH 7.3. 
It is a conjugate base of a beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc(PP-Dol).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CC/C=C(/C)\\\\\\\\CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)CCOP(=O)([O-])OP(=O)([O-])OC1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)NC(=O)C)O)NC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26914',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is dianion of UDP-N-acetyl-D-galactosamine arising from deprotonation of the diphosphate OH groups; major species at pH 7.3. It has a role as a human metabolite. It is a conjugate base of an UDP-N-acetyl-D-galactosamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](OC1OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21453',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a butirosin that is butirosin B in which a gamma-L-glutamyl is attached to the amino group of the (S)-4-amino-2-hydroxybutyrate side-chain. It has a role as an antimicrobial agent. It derives from a neamine. 
It is a conjugate base of a gamma-L-glutamylbutirosin B(3+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CCNC(=O)CC[C@@H](C(=O)O)N)O)O)O[C@H]2[C@@H]([C@@H]([C@H](O2)CO)O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CN)O)O)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7479',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a tetrahydroxyflavone that is kaempferol substituted by a lavandulyl group at position 8. Isolated from Sophora flavescens, it exhibits antioxidant activity. It has a role as a metabolite, a radical scavenger and an EC 1.1.1.21 (aldehyde reductase) inhibitor. It is a member of flavonols and a tetrahydroxyflavone. It derives from a kaempferol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC(CC1=C2C(=C(C=C1O)O)C(=O)C(=C(O2)C3=CC=C(C=C3)O)O)C(=C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22985',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a keto-D-gluconate that is the conjugate base of 3-dehydro-D-gluconic acid, obtained by deprotonation of the carboxy group. It is a keto-D-gluconate and a 3-oxo monocarboxylic acid anion. 
It is a conjugate base of a 3-dehydro-D-gluconic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@H](C(=O)[C@H](C(=O)[O-])O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_432',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphonate OH and three of the four triphosphate OH groups of alpha-D-ribose 1-methylphosphonate 5-triphosphate. It is an organophosphate oxoanion and an organophosphonate oxoanion. It is a conjugate base of an alpha-D-ribose 1-methylphosphonate 5-triphosphate. It is a conjugate acid of an alpha-D-ribose 1-methylphosphonate 5-triphosphate(5-).\\nThe corresponding SMILES representation is:\\nCP(=O)([O-])O[C@@H]1[C@@H]([C@@H]([C@H](O1)COP(=O)([O-])OP(=O)([O-])OP(=O)(O)[O-])O)O\\nThe natural language question is: The molecule is a para-terphenyl that consists of 1,4-diphenylbenzene substituted by acetyloxy group at position 3', hydroxy groups at positions 4 and 4'' a (3-hydroxybutanoyl)oxy group at position 6' and [3-(acetyloxy)butanoyl]oxy groups at positions 2' and 5' respectively. It is isolated from the fruit body of the mushroom Paxillus curtisii and exhibits radical scavenging activity. It has a role as a metabolite and a radical scavenger. It is an acetate ester, a member of phenols and a para-terphenyl. It derives from a 3-hydroxybutyric acid. 
It derives from a hydride of a 1,4-diphenylbenzene.\\nThe corresponding SMILES representation is:\\nCC(CC(=O)OC1=C(C(=C(C(=C1C2=CC=C(C=C2)O)OC(=O)CC(C)OC(=O)C)OC(=O)C)C3=CC=C(C=C3)O)OC(=O)CC(C)OC(=O)C)O\\nThe natural language question is: The molecule is a cyclic ketone that is cyclohexa-2,5-dien-1-one substituted by a methoxy group at position 5, a 2-oxopropyl group at position 4, a hydroxy group at position 4 and a prop-2-en-1-yl group at position 2 which in turn is substituted by s a 3-hydroxy-4-methoxyphenyl group at position 1. It has been isolated from Pterocarpus santalinus. It has a role as a metabolite and a plant metabolite. It is an aromatic ether, a member of phenols and a cyclic ketone.\\nThe corresponding SMILES representation is:\\nCC(=O)CC1(C=C(C(=O)C=C1OC)[C@@H](C=C)C2=CC(=C(C=C2)OC)O)O\\nThe natural language question is: The molecule is a trialkyl phosphate, an organophosphate insecticide and an organochlorine compound. It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor, an acaricide and an agrochemical. It derives from a hydride of a bicyclo[3.2.0]hepta-2,6-diene.\\nThe corresponding SMILES representation is:\\nCOP(=O)(OC)OC1=C(C2C1CC=C2)Cl\\nNext, you will be given a sample for test.The natural language question is: The molecule is a furanocoumarin that is 2H-furo[2,3-h]chromen-2-one substituted by a hydroxy group at position 5, 2-hydroxypropan-2-yl group at position 8, a 2-methylpropanoyl group at position 6 and a phenyl group at position 4. Isolated from the bark of Ochrocarpos punctatus, it exhibits cytotoxicity against the A2780 ovarian cancer cell line. It has a role as a metabolite and an antineoplastic agent. 
It is a furanocoumarin, a member of phenols and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(C)C(=O)C1=C2C(=C3C(=C1O)C(=CC(=O)O3)C4=CC=CC=C4)C=C(O2)C(C)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8391',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a glucotriose consisting of two beta-D-glucopyranose residues and a D-glucopyranose residue joined in sequence by (1->6) and (1->3) glycosidic bonds. It derives from a laminarabiose and a beta-D-Glcp-(1->6)-beta-D-Glcp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O[C@H]3[C@@H]([C@H](OC([C@@H]3O)O)CO)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24381',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 4-oxo monocarboxylic acid that is a synthetic musk fragrance and hydrophobic hapten with an indane core. It has a role as a hapten and a fragrance. It is a member of indanes and a 4-oxo monocarboxylic acid. It derives from a traseolide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C2=C(C1(C)C)C=C(C(=C2)C(=O)CCC(=O)O)C)C(C)C\\nThe natural language question is: The molecule is an N-nitrosourea that is urea in which one of the nitrogens is substituted by a 2-chloroethyl group and by a nitroso group, while the other nitrogen is substituted by a cyclohexyl group. An alkylating antineoplastic agent, it is used in the treatment of brain tumours, lung cancer, malignant melanoma and other solid tumours. It has a role as an alkylating agent and an antineoplastic agent. 
It is a member of N-nitrosoureas and an organochlorine compound.\\nThe corresponding SMILES representation is:\\nC1CCC(CC1)NC(=O)N(CCCl)N=O\\nThe natural language question is: The molecule is an (omega-1)-hydroxy fatty acid that is the conjugate base of 5-hydroxyhexanoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is an (omega-1)-hydroxy fatty acid anion, a medium-chain fatty acid anion and a hydroxy saturated fatty acid anion. It is a conjugate base of a 5-hydroxyhexanoic acid.\\nThe corresponding SMILES representation is:\\nCC(CCCC(=O)[O-])O\\nThe natural language question is: The molecule is a glycopeptidolipid antigen from clinically prominent members of the Mycobacterium avium serocomplex It has a role as an antigen.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(CC(=O)N[C@H](CC1=CC=CC=C1)C(=O)N[C@H]([C@@H](C)OC2[C@@H]([C@@H]([C@@H]([C@@H](O2)C)O)O)O[C@H]3[C@@H]([C@@H]([C@H]([C@@H](O3)C)O)O[C@H]4[C@@H]([C@H]([C@H]5[C@H](O4)CO[C@](O5)(C)C(=O)O)OC)O)O)C(=O)N[C@H](C)C(=O)N[C@@H](C)COC6[C@@H]([C@@H]([C@H]([C@@H](O6)C)OC)OC)O)O.O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a molybdenum coordination entity consisting of a cntral molybdenum in the +5 oxidation state coordinated to two oxygens and one sulfanyl group.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'O=[Mo]=O.[SH-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24134',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an organosulfonate salt obtained by combining nintedanib with one molar equivalent of ethanesulfonic acid. A kinase inhibitor used for the treatment of idiopathic pulmonary fibrosis and cancer. 
It has a role as an angiogenesis inhibitor, an antineoplastic agent, a fibroblast growth factor receptor antagonist, a tyrosine kinase inhibitor and a vascular endothelial growth factor receptor antagonist. It contains a nintedanib(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCS(=O)(=O)O.CN1CCN(CC1)CC(=O)N(C)C2=CC=C(C=C2)N=C(C3=CC=CC=C3)C4=C(NC5=C4C=CC(=C5)C(=O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25983',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is conjugate base of (2S)-2-hydroxyphytanic acid. It is a (2S)-2-hydroxy monocarboxylic acid anion, an isoprenoid and a long-chain fatty acid anion. It derives from a hexadecanoic acid and a hexadecanoate. It is a conjugate base of a (2S)-2-hydroxyphytanic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)CCCC(C)CCCC(C)CCCC(C)[C@@H](C(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18355',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 6-oxo monocarboxylic acid and a 2-hydroxy monocarboxylic acid. It derives from a sorbic acid. It is a conjugate acid of a 2-hydroxy-6-(2-hydroxyphenoxy)-6-oxo-cis,cis-hexa-2,4-dienoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)O)OC(=O)/C=C\\\\\\\\C=C(/C(=O)O)\\\\\\\\O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10276',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of 4-pyridones that is N-ethyl-4-pyridone that is substituted at positions 2, 3, and 6 by p-chlorophenyl, carboxy, and methyl groups, respectively. 
It is used (particularly as its potassium salt, known as karetazan-potassium) as a chemical hybridisation agent for commercial hybrid seed production. It is not approved for use within the European Union. It has a role as a chemical hybridisation agent. It is a monocarboxylic acid, a member of monochlorobenzenes, a biaryl and a member of 4-pyridones. It is a conjugate acid of a karetazan(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN1C(=CC(=O)C(=C1C2=CC=C(C=C2)Cl)C(=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13450',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a member of the class of triazoles that is 2,4-dihydro-3H-1,2,4-triazol-3-one which is substituted at positions 2, 4, and 5 by 5-(2-carboxy-2-chloroethyl)-4-chloro-2-fluorophenyl, difluoromethyl, and methyl groups, respectively. It is a member of triazoles, a member of monochlorobenzenes, a member of monofluorobenzenes and a monocarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=NN(C(=O)N1C(F)F)C2=C(C=C(C(=C2)CC(C(=O)O)Cl)Cl)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14538',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of furans that is a metabolite of ranitidine in rats and humans. It has a role as a drug metabolite, a rat metabolite and a human urinary metabolite. It is a C-nitro compound, a secondary amino compound, a member of furans and an organic sulfide.\\nThe corresponding SMILES representation is:\\nCNCC1=CC=C(O1)CSCCNC(=C[N+](=O)[O-])NC\\nThe natural language question is: The molecule is a acid that is propane in which three carboxy groups are attached at the C-1 position. 
It is a conjugate acid of a 1,1,1-propanetricarboxylate.\\nThe corresponding SMILES representation is:\\nCCC(C(=O)O)(C(=O)O)C(=O)O\\nThe natural language question is: The molecule is a member of the class of tetralins that is tetralin which is substituted by a methyl group at position 1 and at position 5, It is an ortho-fused bicyclic hydrocarbon and a member of tetralins. It derives from a hydride of a tetralin.\\nThe corresponding SMILES representation is:\\nCC1CCCC2=C(C=CC=C12)C\\nThe natural language question is: The molecule is an orpanophosphate oxoanion that is the trianion of ditrans,polycis-decaprenyl diphosphate. It is a conjugate base of a ditrans,polycis-decaprenyl diphosphate.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C/CC/C(=C/COP(=O)([O-])OP(=O)([O-])[O-])/C)/C)/C)/C)/C)/C)/C)/C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a polyunsaturated fatty acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of 11,12-epoxy-(5Z,8Z,14Z)-icosatrienoyl-CoA; major species at pH 7.3. It is a long-chain fatty acyl-CoA(4-) and a polyunsaturated fatty acyl-CoA(4-). It is a conjugate base of an 11,12-epoxy-(5Z,8Z,14Z)-icosatrienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\CC1C(O1)C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10469',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a hydroxy fatty acid anion that is the conjugate base of omega-hydroxydotriacontanoic acid, obtained by deprotonation of the carboxy group. 
It is an omega-hydroxy fatty acid anion, an ultra-long-chain fatty acid anion and an omega-hydroxy-ultra-long-chain fatty acid anion. It is a conjugate base of an omega-hydroxydotriacontanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCCCCCCCCCCCC(=O)[O-])CCCCCCCCCCCCCCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1533',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a benzo[5,6]cyclohepta[1,2-b]pyridine having a 1-methylpiperidin-4-ylidene group at the 11-position. It has a role as a H1-receptor antagonist and an anti-allergic agent. It is a benzocycloheptapyridine and a tertiary amine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1CCC(=C2C3=CC=CC=C3CCC4=C2N=CC=C4)CC1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21624',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organic heterotricyclic compound that is 5-chloro-10-hydroxy-8-methyl-1H-oxepino[4,3-b]chromene-3,11-dione which is substituted at positions 1, 5, 8, and 10 by methoxycarbonyl, chlorine, methyl, and hydroxy groups, respectively (the 1S enantiomer). Found in Monilia fructicola and in the mycoherbicide Alternaria sonchi. It has a role as a plant metabolite and a fungal metabolite. It is an organic heterotricyclic compound, an organochlorine compound, an epsilon-lactone and a methyl ester.\\nThe corresponding SMILES representation is:\\nCC1=CC(=C2C(=C1)OC3=C(C2=O)[C@H](OC(=O)C=C3Cl)C(=O)OC)O\\nThe natural language question is: The molecule is a methoxybenzoic acid that is O-methylsalicylic acid substituted by chloro groups at positions 3 and 6. 
It has a role as a xenobiotic, an environmental contaminant, a herbicide, a synthetic auxin and an agrochemical. It is a methoxybenzoic acid and a dichlorobenzene. It is a conjugate acid of a 3,6-dichloro-2-methoxybenzoate.\\nThe corresponding SMILES representation is:\\nCOC1=C(C=CC(=C1C(=O)O)Cl)Cl\\nThe natural language question is: The molecule is a 1-acylglycerone 3-phosphate(2-) obtained by deprotonation of the phosphate OH groups of 1-arachidonoylglycerone 3-phosphate; major species at pH 7.3. It is a conjugate base of a 1-arachidonoylglycerone 3-phosphate.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)OCC(=O)COP(=O)([O-])[O-]\\nThe natural language question is: The molecule is a pyrimidotriazine that is 1,6-dimethyl-1,5,6,7-tetrahydropyrimido[5,4-e][1,2,4]triazine with oxo groups at positions 5 and 7. It has a role as an antineoplastic agent, a toxin, a Wnt signalling inhibitor, an apoptosis inducer, a bacterial metabolite, an antibacterial agent and a virulence factor. It is a pyrimidotriazine and a carbonyl compound. It derives from a reumycin.\\nThe corresponding SMILES representation is:\\nCN1C2=NC(=O)N(C(=O)C2=NC=N1)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is an indole alkaloid that is vinorine bearing a hydroxy substituent at position 21. It is an indole alkaloid and a hemiaminal. 
It derives from a vinorine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C=C/1\\\\\\\\[C@@H]2C[C@H]3C4=NC5=CC=CC=C5[C@]46C[C@@H](C2[C@H]6OC(=O)C)N3[C@@H]1O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5402',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a phenanthroline that consists of 1,10-phenanthroline bearing two methyl groups at position 2 and 9 as well as two 4-sulfophenyl groups at positions 4 and 7. It has a role as a chelator. It is a member of phenanthrolines and an arenesulfonic acid. It derives from a hydride of a 1,10-phenanthroline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=C2C=CC3=C(C=C(N=C3C2=N1)C)C4=CC=C(C=C4)S(=O)(=O)O)C5=CC=C(C=C5)S(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11128',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a zwitterion resulting from a transfer of a proton from the phosphonate to the amino group of (2-amino-1-hydroxyethyl)phosphonate. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a tautomer of a 1-hydroxy-2-aminoethylphosphonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(O)P(=O)(O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4428',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an L-alanine derivative obtained by formal condensation of the carboxy group of fumaric acid monoamide with the side-chain amino group of 3-amino-L-alanine. It has a role as a bacterial metabolite. It is an enamide, a L-alanine derivative, a primary carboxamide and a secondary carboxamide. 
It derives from a 3-amino-L-alanine and a fumaric acid. It is a tautomer of a N(3)-fumaramoyl-(S)-2,3-diaminopropanoic acid zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H](C(=O)O)N)NC(=O)/C=C/C(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_572',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a withanolide that is 2,3-dihydrowithaferin A substituted by a beta-methoxy group at position 3. It has been isolated from the aerial parts of Physalis longifolia. It has a role as a metabolite and a plant metabolite. It is a delta-lactone, a 27-hydroxy steroid, a 4-hydroxy steroid, an ergostanoid, a primary alcohol, a secondary alcohol, a withanolide and an epoxy steroid. It derives from a withaferin A.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=O)O[C@H](C1)[C@@H](C)[C@H]2CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3C[C@@H]5[C@]6([C@@]4(C(=O)C[C@@H]([C@@H]6O)OC)C)O5)C)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26518',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 2-hydroxycarboxylate that is obtained by removal of a proton from the carboxylic acid group of 4-hydroxymandelic acid. It is a 2-hydroxy carboxylate and a member of phenols. It derives from a mandelate. 
It is a conjugate base of a 4-hydroxymandelic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(C(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22444',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the 2-(methoxycarbonylmethyl) derivative of fumaric acid. It is a dicarboxylic acid and a methyl ester. It derives from a fumaric acid. It is a conjugate acid of a (2E)-2-(methoxycarbonylmethyl)but-2-enedioate(2-).\\nThe corresponding SMILES representation is:\\nCOC(=O)C/C(=C\\\\\\\\C(=O)O)/C(=O)O\\nThe natural language question is: The molecule is a terpene lactone that is 3,4,5,5a,6,7,7a,10,10a,10b-decahydro-2H-1-oxabenzo[cd]cyclopenta[h]azulen-2-one substituted by a hydroxy group at position 10b and methyl groups at positions 3, 6, 9 and 10a. It is isolated from the West Indian gorgonian octocoral Pseudopterogorgia elisabethae and exhibits antitubercular and antimalarial activity. It has a role as a metabolite, an antimalarial and an antitubercular agent. It is a terpene lactone, an organic heterotetracyclic compound and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\nC[C@H]1CC[C@@H]2[C@H](C[C@@H]3C=C(C[C@@]3([C@]4(C2=C1C(=O)O4)O)C)C)C\\nThe natural language question is: The molecule is a member of the class of asterriquinones that is asterriquinone in which the hydrogens of both of the hydroxy groups have replaced by methyl groups.\\nThe corresponding SMILES representation is:\\nCC(C)(C=C)N1C=C(C2=CC=CC=C21)C3=C(C(=O)C(=C(C3=O)OC)C4=CN(C5=CC=CC=C54)C(C)(C)C=C)OC\\nThe natural language question is: The molecule is an alkaloid that is an enamide obtained by the formal condensation of 3-methylbut-2-enoic acid with 6-(2-amino-1-methoxyethyl)-4H-pyrido[2,3,4-kl]acridin-4-one. 
It is isolated from the Okinawan marine tunicate Cystodytes dellechiajei and exhibits cytotoxicity against human epidermoid carcinoma KB cells. It has a role as a metabolite and an antineoplastic agent. It is an alkaloid, an enone, an enamide, an organic heterotetracyclic compound, an ether and a secondary carboxamide. It derives from a 3-methylbut-2-enoic acid.\\nThe corresponding SMILES representation is:\\nCC(=CC(=O)NCC(C1=CC(=O)C2=NC=CC3=C2C1=NC4=CC=CC=C34)OC)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is inosine carrying a methyl substituent on the hydroxy group at position 2' on the ribose ring. It has a role as a metabolite. It derives from an inosine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CO[C@@H]1[C@@H]([C@H](O[C@H]1N2C=NC3=C2N=CNC3=O)CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_473',\n", + " 'prompt': \"Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a purine nucleoside in which guanine is attached to ribofuranose via a beta-N(9)-glycosidic bond. It has a role as a fundamental metabolite. It is a purines D-ribonucleoside and a member of guanosines. It derives from a guanine.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O)O)N=C(NC2=O)N\\nThe natural language question is: The molecule is a beta-D-galactosyl-(1->4)-beta-D-glucosyl-(1<->1')-N-acylsphinganine in which the acyl group specified is hexadecanoyl. It has a role as a mouse metabolite. 
It is a beta-D-galactosyl-(1->4)-beta-D-glucosyl-(1<->1')-N-acylsphinganine and a glycosyl-N-hexadecanoylsphinganine.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCC[C@H]([C@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O)NC(=O)CCCCCCCCCCCCCCC)O\\nThe natural language question is: The molecule is a 13-hydroxy-14,15-epoxy-(5Z,8Z,11Z)-icosatrienoic acid in which the three chiral centres at positions 13, 14 and 15 all have S-configuration. It has a role as a human metabolite. It is a conjugate acid of a (13S)-hydroxy-(14S,15S)-epoxy-(5Z,8Z,11Z)-icosatrienoate.\\nThe corresponding SMILES representation is:\\nCCCCC[C@H]1[C@@H](O1)[C@H](/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O\\nThe natural language question is: The molecule is a nucleotide conjugate consisting of CDP joined at the 1-position of 2,3-bis-O-(geranylgeranyl)-sn-glycerol via a diphosphate linkage. It derives from a CDP and a 2,3-bis-O-(geranylgeranyl)-sn-glycerol 1-phosphate. It is a conjugate acid of a CDP-2,3-bis-O-(geranylgeranyl)-sn-glycerol(2-).\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C/COC[C@@H](COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=CC(=NC2=O)N)O)O)OC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)/C)/C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a pyrazole that is 1H-pyrazole bearing an ethyl group at position 3, a 2-hydroxyphenyl group at position 2, and a 4-methoxyphenyl group at position 5. It has a role as a metabolite. 
It is a member of phenols, a member of pyrazoles and a monomethoxybenzene.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCC1=C(C(=NN1)C2=CC=C(C=C2)OC)C3=CC=CC=C3O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27015',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 7alpha-hydroxy steroid, a 12alpha-hydroxy steroid, a cholestanoid and a 3-oxo-Delta(4) steroid. It has a role as a human metabolite and a mouse metabolite.\\nThe corresponding SMILES representation is:\\nC[C@H](CCCC(C)C)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](CC4=CC(=O)CC[C@]34C)O)O)C\\nThe natural language question is: The molecule is a DiHETE that is (5Z,8Z,10E,12E)-icosatetraenoic acid carrying two hydroxy substituents at positions 14 and 15 (the 14R,15S-stereoisomer). It has a role as a human xenobiotic metabolite. It is a conjugate acid of a 14(R),15(S)-DiHETE(1-).\\nThe corresponding SMILES representation is:\\nCCCCC[C@@H]([C@@H](/C=C/C=C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O)O\\nThe natural language question is: The molecule is an imine that is 4-methylidenecyclohexa-2,5-dien-1-imine in which both the hydrogens of the methylidene group are replaced by 4-aminophenyl groups. The hydrochloride salt is the histological dye 'pararosaniline'. It has a role as a fluorochrome and a histological dye. It is a substituted aniline and an imine. It is a conjugate base of a pararosaniline(1+).\\nThe corresponding SMILES representation is:\\nC1=CC(=N)C=CC1=C(C2=CC=C(C=C2)N)C3=CC=C(C=C3)N\\nThe natural language question is: The molecule is a tetrapeptide composed of L-asparagine, L-leucine and two L-aspartic acid units joined in sequence by peptide linkages. It has a role as a metabolite. 
It derives from a L-asparagine, a L-leucine and a L-aspartic acid.\\nThe corresponding SMILES representation is:\\nCC(C)C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(=O)O)C(=O)O)NC(=O)[C@H](CC(=O)N)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is a heparin octasaccharide consisting of 4-deoxy-alpha-L-threo-hex-4-enopyranuronosyl, 2-deoxy-2-(sulfoamino)-alpha-D-glucopyranosyl, (5xi)-D-xylo-hexopyranuronosyl, 2-deoxy-6-O-sulfo-2-(sulfoamino)-alpha-D-glucopyranosyl, (5xi)-D-xylo-hexopyranuronosyl, 2-deoxy-2-(sulfoamino)-alpha-D-glucopyranosyl, (5xi)-D-xylo-hexopyranuronosyl, and 2-deoxy-2-(sulfoamino)-alpha-D-glucopyranose units joined in sequence by (1->4) linkages. Sequence: DHexA-GlcNSO3-HexA-GlcNSO3(6SO4)-HexA-GlcNSO3-HexA-GlcNSO3. It is a heparin octasaccharide, an oligosaccharide sulfate and an amino octasaccharide.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=C(O[C@H]([C@@H]([C@H]1O)O)O[C@@H]2[C@H](O[C@@H]([C@@H]([C@H]2O)NS(=O)(=O)O)O[C@H]3[C@@H]([C@H](C(OC3C(=O)O)O[C@@H]4[C@H](O[C@@H]([C@@H]([C@H]4O)NS(=O)(=O)O)O[C@H]5[C@@H]([C@H](C(OC5C(=O)O)O[C@@H]6[C@H](O[C@@H]([C@@H]([C@H]6O)NS(=O)(=O)O)O[C@H]7[C@@H]([C@H](C(OC7C(=O)O)O[C@@H]8[C@H](O[C@@H]([C@@H]([C@H]8O)NS(=O)(=O)O)O)CO)O)O)CO)O)O)COS(=O)(=O)O)O)O)CO)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2844',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a trisaccharide composed of an alpha-L-rhamnosyl residue linked (1->2) to a beta-D-galactosyl residue which is in turn linked (1->4) to beta-L-rhamnose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@H]2O[C@H]3[C@@H](O[C@@H]([C@@H]([C@@H]3O)O)O)C)CO)O)O)O)O)O\\nThe natural language question is: The molecule is tetraanion of 3-oxotetradecanoyl-CoA arising from 
deprotonation of the phosphate and diphosphate functions; principal microspecies at pH 7.3. It is a conjugate base of a 3-oxotetradecanoyl-CoA.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nThe natural language question is: The molecule is a polyprenol diphosphate having (Z,Z,Z)-geranylgeraniyl (nerylneryl) as the polyprenyl component. It derives from a (Z,Z,Z)-geranylgeraniol. It is a conjugate acid of a nerylneryl diphosphate(3-).\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)(O)OP(=O)(O)O)/C)/C)/C)C\\nThe natural language question is: The molecule is the simplest member of the class toluenes consisting of a benzene core which bears a single methyl substituent. It has a role as a non-polar solvent, a cholinergic antagonist, a neurotoxin and a fuel additive. It is a methylbenzene, a volatile organic compound and a member of toluenes.\\nThe corresponding SMILES representation is:\\nCC1=CC=CC=C1\\nNext, you will be given a sample for test.The natural language question is: The molecule is a three-membered deoxy oligosaccharide which has a 6-deoxy-N-acetyl-D-glucosamine unit at the reducing end with an alpha-D-galactosyl-(1->3)-beta-D-galactosyl group attached at the 4-position. 
It is an amino trisaccharide, a deoxy oligosaccharide derivative and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@H]([C@@H]([C@H](C(O1)O)NC(=O)C)O)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10858',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the carboxy, thiocarboxy and phosphate OH groups of pyridinium-3-carboxy-5-thiocarboxylic acid mononucleotide; major species at pH 7.3. It is an organic sulfur anion, an organophosphate oxoanion and a monocarboxylic acid anion. It is a conjugate base of a pyridinium-3-carboxy-5-thiocarboxylic acid mononucleotide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(C=[N+](C=C1C(=O)[S-])[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])[O-])O)O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28188',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an UDP-D-galacturonic acid. It has a role as a mouse metabolite. It derives from an alpha-D-galacturonic acid. It is a conjugate acid of an UDP-alpha-D-galacturonate(3-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)C(=O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1977',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (9Z,12Z)-hexadecadienoyl-CoA; major species at pH 7.3. 
It is a conjugate base of a (9Z,12Z)-hexadecadienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17022',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hydroxamic acid anion resulting from the removal of a proton from each of the hydroxamic acid groups of desferrialbomycin epsilon. It is a conjugate base of a desferrialbomycin epsilon.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N(CCC[C@@H](C(=O)N[C@@H](CCCN(C(=O)C)[O-])C(=O)N[C@@H](CCCN(C(=O)C)[O-])C(=O)N[C@@H](CO)C(=O)N[C@H]([C@@H]([C@@H]1[C@@H]([C@H]([C@@H](S1)N2C=CC(=N)N(C2=O)C)O)O)O)C(=O)O)N)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4840',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is conjugate base of N-arachidonoylglycine. It has a role as a human metabolite. It is a N-acylglycinate and a N-(fatty acyl)-glycine(1-). It is a conjugate base of a N-arachidonoylglycine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)NCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26681',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a dipeptide composed of L-asparagine and glycine joined by a peptide linkage. It has a role as a metabolite. 
It derives from a L-asparagine and a glycine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H](C(=O)NCC(=O)O)N)C(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1549',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a linear tetrapyrrole anion obtained by deprotonation of both the carboxy groups of 15-oxo-beta-bilirubin. It is a linear tetrapyrrole anion and a dicarboxylic acid dianion. It is a conjugate base of a 15-oxo-beta-bilirubin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(/C(=C/C2=C(C(=C(N2)C(=O)C3=C(C(=C(N3)/C=C\\\\\\\\4/C(=C(C(=O)N4)C=C)C)C=C)C)C)CCC(=O)[O-])/NC1=O)CCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7816',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an amino disaccharide consisting of 2-acetamido-2-deoxy-beta-D-glucopyranose and beta-D-mannopyranose joined in sequence by a (1->2) glycosidic bond. It is an amino disaccharide and a member of acetamides. It derives from a beta-D-mannose and a N-acetyl-beta-D-glucosamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@H]2[C@H]([C@@H]([C@H](O[C@H]2O)CO)O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27448',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a carbamate ester. It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor, a carbamate insecticide and an agrochemical. 
It derives from a methylcarbamic acid and a 3,4-xylenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C=C(C=C1)OC(=O)NC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21668',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a homocysteine that has L configuration. It has a role as a mouse metabolite. It is a homocysteine and a serine family amino acid. It is a conjugate acid of a L-homocysteinate. It is a tautomer of a L-homocysteine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CS)[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22208',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a metal sulfate in which the metal component is manganese in the +2 oxidation state. It has a role as a nutraceutical. It is a metal sulfate and a manganese molecular entity. It contains a manganese(2+).\\nThe corresponding SMILES representation is:\\n[O-]S(=O)(=O)[O-].[Mn+2]\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of 3-(1H-indol-3-yl)propanoic acid. It has a role as a human metabolite. It is a conjugate base of a 3-(1H-indol-3-yl)propanoic acid.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C(=CN2)CCC(=O)[O-]\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (2E,13Z)-docosadienoic acid. It is a long-chain fatty acyl-CoA, an unsaturated fatty acyl-CoA and an 11,12-saturated fatty acyl-CoA. 
It is a conjugate acid of a (2E,13Z)-docosadienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is a hydrochloride obtained by reaction of oxycodone with one molar equivalent of hydrochloric acid. It is a moderately potent opioid analgesic, generally used for relief of moderate to severe pain. It has a role as a mu-opioid receptor agonist, an antitussive and an opioid analgesic. It contains an oxycodone(1+).\\nThe corresponding SMILES representation is:\\nCN1CC[C@]23[C@@H]4C(=O)CC[C@]2([C@H]1CC5=C3C(=C(C=C5)OC)O4)O.Cl\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organic heterotricyclic compound that is isolated from the fermentation broth of Curvularia sp. RK97-F166. It has a role as a fungal metabolite. It is a cyclic hemiketal, a bridged compound, an organic heterotricyclic compound and a spiro-epoxide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C[C@H]2[C@](CC1)([C@]3(C([C@@](O2)(C[C@@]34CO4)O)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15655',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an anthracycline antibiotic that is aclacinomycin A in which the ketone function on the trisaccharide fragment has undergone 2,3-dehydrogenation to afford the corresponding enone. It has a role as an antimicrobial agent and a metabolite. It is an aminoglycoside, an anthracycline, a member of phenols, a polyketide, a tertiary alcohol, a trisaccharide derivative, an enone, a member of tetracenequinones and a methyl ester. It derives from an aklavinone. It is a conjugate base of an aclacinomycin Y(1+). 
It is a tautomer of an aclacinomycin Y zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@]1(C[C@@H](C2=C(C3=C(C=C2[C@H]1C(=O)OC)C(=O)C4=C(C3=O)C(=CC=C4)O)O)O[C@H]5C[C@@H]([C@@H]([C@@H](O5)C)O[C@H]6C[C@@H]([C@@H]([C@@H](O6)C)O[C@H]7C=CC(=O)[C@@H](O7)C)O)N(C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8573',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an etabonate ester, an 11beta-hydroxy steroid, a steroid ester, an organochlorine compound, a steroid acid ester and a 3-oxo-Delta(1),Delta(4)-steroid. It has a role as an anti-inflammatory drug. It derives from a loteprednol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCOC(=O)O[C@@]1(CC[C@@H]2[C@@]1(C[C@@H]([C@H]3[C@H]2CCC4=CC(=O)C=C[C@]34C)O)C)C(=O)OCCl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24042',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a diterpenoid of the class of daphnane-type terpenes. Isolated from Trigonostemon reidioides, it exhibits cytotoxicity against various cancer cell lines. It has a role as a metabolite and an antineoplastic agent. 
It is a benzoate ester, a diterpenoid, an ortho ester, an epoxide and a terpene lactone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C[C@H]2[C@@]34[C@@H]([C@H]5[C@@]6([C@@H]([C@@H]3[C@H]7[C@](O7)([C@H]([C@@]2([C@H]1OC(=O)/C=C/C=C\\\\\\\\[C@@H](C8CCC(CC8)C[C@@]6(C)O)OC(=O)C9=CC=CC=C9)O)O)CO)O[C@](O5)(O4)C1=CC=CC=C1)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15019',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a N(5)-alkylglutamine where the alkyl group is ethyl. It has been isolated from green tea. It has a role as a neuroprotective agent and a plant metabolite. It is a tautomer of a N(5)-ethyl-L-glutamine zwitterion.\\nThe corresponding SMILES representation is:\\nCCNC(=O)CC[C@@H](C(=O)O)N\\nThe natural language question is: The molecule is a derivative of scyllo-inositol having a guanidino group in place of the 1-hydroxy group. It derives from a scyllo-inositol. It is a conjugate base of a 1-guanidiniumyl-1-deoxy-scyllo-inositol(1+).\\nThe corresponding SMILES representation is:\\n[C@H]1([C@H](C([C@H]([C@@H](C1N=C(N)N)O)O)O)O)O\\nThe natural language question is: The molecule is a 2-(4-isopropyl-4-methyl-5-oxo-4,5-dihydro-1H-imidazol-2-yl)nicotinic acid that has S configuration. It is a conjugate acid of a (S)-imazapyr(1-). It is an enantiomer of a (R)-imazapyr.\\nThe corresponding SMILES representation is:\\nCC(C)[C@]1(C(=O)NC(=N1)C2=C(C=CC=N2)C(=O)O)C\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (2E,14Z,17Z,20Z,23Z,26Z,29Z)-dotriacontaheptaenoic acid. It is an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. 
It is a conjugate acid of a (2E,14Z,17Z,20Z,23Z,26Z,29Z)-dotriacontaheptaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a withanolide that is 5,6:22,26-diepoxyergost-24-ene substituted by hydroxy groups at positions 4, 19 and 27, a methoxy group at position 3 and oxo groups at positions 1 and 26. It has been isolated from the aerial parts of Physalis longifolia. It has a role as a metabolite and a plant metabolite. It is a delta-lactone, a 27-hydroxy steroid, a 4-hydroxy steroid, a 19-hydroxy steroid, an ergostanoid, a primary alcohol, a secondary alcohol, a withanolide and an epoxy steroid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=O)O[C@H](C1)[C@@H](C)[C@H]2CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3C[C@@H]5[C@]6([C@@]4(C(=O)C[C@@H]([C@@H]6O)OC)CO)O5)C)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23278',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the aromatic diazonium ion that is diazotised 2-aminophenylarsonic acid. It has a role as a hapten. It derives from a phenylarsonate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)[N+]#N)[As](=O)(O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13644',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the conjugate acid of 1-amino-1-deoxy-scyllo-inositol arising from protonation of the primary amino group. 
It is a conjugate acid of a 1-amino-1-deoxy-scyllo-inositol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@H]1([C@H](C([C@H]([C@@H](C1[NH3+])O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8718',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an ether that is 2-(1H-imidazol-1-yl)-1-(4-methoxyphenyl)ethanol in which the hydrogen of the hydroxy group has been substituted by a 3-(4-methoxyphenyl)propyl group. It has a role as a TRP channel blocker. It is a member of imidazoles, a monomethoxybenzene and an ether. It is a conjugate base of a SKF-96365 free base(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC=C(C=C1)CCCOC(CN2C=CN=C2)C3=CC=C(C=C3)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5843',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an N-acylglycinate that is the conjugate base of N-tetracosanoylglycine, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a N-tetracosanoylglycine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCC(=O)NCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18851',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a 1,2-diacyl-sn-glycero-3-phosphocholine in which the acyl groups specified at positions 1 and 2 are palmitoyl and (11Z,14Z)-eicosadienoyl respectively. It has a role as a mouse metabolite. It is a 1,2-diacyl-sn-glycero-3-phosphocholine and a phosphatidylcholine 36:2. 
It derives from a hexadecanoic acid and an (11Z,14Z)-icosadienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23405',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of pterocarpans that is (6aR,11aR)-pterocarpan substituted by hydroxy groups at positions 3 and 8, methoxy groups at positions 1 and 9 and prenyl groups at positions 2 and 10. Isolated from the roots of Lespedeza floribunda, it acts as a melanin synthesis inhibitor. It has a role as a melanin synthesis inhibitor and a plant metabolite. It is an aromatic ether, a member of phenols and a member of pterocarpans.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC1=C(C2=C(C=C1O)OC[C@@H]3[C@H]2OC4=C(C(=C(C=C34)O)OC)CC=C(C)C)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4350',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a polyunsaturated fatty acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate OH groups of (9Z,12Z)-hexadeca-9,12,15-trienoyl-CoA; major species at pH 7.3. It is a polyunsaturated fatty acyl-CoA(4-) and a long-chain fatty acyl-CoA(4-). 
It is a conjugate base of a (9Z,12Z)-hexadeca-9,12,15-trienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CC=C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1064',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organic heteropentacyclic compound comprising (2aS,5aR,8aR,8bS)-8a-hydroxy-2a,5a-dimethyldecahydro-8H-naphtho[1,8-bc]furan-8-one ortho-fused to C-6 and C-7 of 1,4-naphthoquinone. An antiplasmodial drug isolated from New Caledonian deep water sponge. It has a role as a metabolite and an antiplasmodial drug. It is an organic heteropentacyclic compound, a cyclic hemiketal and a member of p-quinones. It derives from a 1,4-naphthoquinone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CCC[C@]3([C@@H]1[C@](C(=O)C4=C3C=C5C(=O)C=CC(=O)C5=C4)(OC2)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23079',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a galactosylceramide sulfate(1-) in which the ceramide N-acyl group is specified as (R)-2-hydroxybehenoyl. 
It is a conjugate base of a 1-(3-O-sulfo-beta-D-galactosyl)-N-[(2R)-2-hydroxybehenoyl]sphingosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCC[C@H](C(=O)N[C@@H](CO[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)OS(=O)(=O)[O-])O)[C@@H](/C=C/CCCCCCCCCCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12606',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 2-acetyl-1-alkyl-sn-glycero-3-phosphocholine betaine which has octadecyl as the alkyl group. PAF is a potent phospholipid activator and mediator of many leukocyte functions, including platelet aggregation, inflammation, and anaphylaxis. It has a role as an antihypertensive agent, a beta-adrenergic antagonist, a hematologic agent, a vasodilator agent and a bronchoconstrictor agent.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCOC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)C\\nThe natural language question is: The molecule is an apo carotenoid triterpenoid that is tetracosane containing double bonds at the 2-3, 6-7, 10-11, 12-13, 14-15, 18-19, and 22-23 positions, and substituted by methyl groups at positions 2, 6, 10, 15, 19, and 23. It is an apo carotenoid triterpenoid, a triterpene and a polyene.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/C=C/C=C(/CC/C=C(/CCC=C(C)C)\\\\\\\\C)\\\\\\\\C)/C)/C)C\\nThe natural language question is: The molecule is an isochromane that is 3,4-dihydroisocoumarin with a butyl substituent at position 7, hydroxy substituents at positions 6 and 8 and a 3E-pent-3-en-1-yl group at position 3. It is isolated from an endophytic fungus, Geotrichum. It has a role as a metabolite, an antimalarial, an antifungal agent and an antitubercular agent. It is a member of isochromanes and a member of phenols. 
It derives from a 3,4-dihydroisocoumarin.\\nThe corresponding SMILES representation is:\\nCCCCC1=C(C=C2C[C@H](OC(=O)C2=C1O)CC/C=C/C)O\\nThe natural language question is: The molecule is a beta-diketone isolated from Ochrocarpos punctatus and has been shown to exhibit antineoplastic activity. It has a role as a metabolite and an antineoplastic agent. It is a cyclic ketone, a peroxol, a bridged compound, a cyclic ether, an organic heterotricyclic compound, a beta-diketone, an enone, an aromatic ketone and a beta-triketone.\\nThe corresponding SMILES representation is:\\nCC(=CCC1CC2(C3=C(CC(C(O3)(C)C)OO)C(=O)C(C2=O)(C1(C)C)C(=O)C4=CC=CC=C4)CC=C(C)C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a tautomer of pyrrole that has the double bonds at positions 2 and 4. It is a pyrrole and a secondary amine. It is a tautomer of a 2H-pyrrole and a 3H-pyrrole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CNC=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6807',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an (omega-1)-hydroxy fatty acid ascaroside obtained by formal condensation of the alcoholic hydroxy group of (2E,14R)-14-hydroxypentadec-2-enoic acid with ascarylopyranose (the alpha anomer). It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is an alpha,beta-unsaturated monocarboxylic acid and an (omega-1)-hydroxy fatty acid ascaroside. It derives from a (2E,14R)-14-hydroxypentadec-2-enoic acid. 
It is a conjugate acid of an ascr#25(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCCCCCC/C=C/C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2404',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a myo-inositol trisphosphate. It has a role as a mouse metabolite. It derives from a myo-inositol. It is a conjugate acid of a 1D-myo-inositol 1,3,4-trisphosphate(6-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@H]1([C@H]([C@@H]([C@H]([C@H]([C@H]1OP(=O)(O)O)O)OP(=O)(O)O)OP(=O)(O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10778',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a hydrochloride that is the monohydrochloride salt of benserazide. An aromatic-L-amino-acid decarboxylase inhibitor (DOPA decarboxylase inhibitor) that does not enter the central nervous system, it is used as an adjunct to levodopa in the treatment of parkinsonism. By preventing the conversion of levodopa to dopamine in the periphery, it causes an increase in the amount of levodopa reaching the central nervous system and so reduces the required dose. Benserazide hydrochloride has no antiparkinson actions when given alone. It has a role as an antiparkinson drug, an EC 4.1.1.28 (aromatic-L-amino-acid decarboxylase) inhibitor and a dopaminergic agent. 
It contains a benserazide(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1CNNC(=O)C(CO)N)O)O)O.Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13277',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 2-halobenzoic acid that is benzoic acid carrying a fluoro substituent at position 2. It is a fluorobenzoic acid and a 2-halobenzoic acid. It is a conjugate acid of a 2-fluorobenzoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)C(=O)O)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19297',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hopanoid that is hop-22(29)-ene substituted by hydroxy groups at positions 7 and 15 respectively. It has been isolated from Hypocrella species. It has a role as a fungal metabolite. It is a hopanoid, a pentacyclic triterpenoid and a diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=C)[C@H]1CC[C@]2([C@H]1C[C@@H]([C@@]3([C@@H]2CC[C@H]4[C@]3([C@H](C[C@@H]5[C@@]4(CCCC5(C)C)C)O)C)C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23738',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a HETE that is (5Z,8Z,12E,14Z)-icosa-5,8,12,14-tetraenoic acid substituted at position 11 by a hydroxy group. It has a role as a mouse metabolite. It derives from an icosa-5,8,12,14-tetraenoic acid. 
It is a conjugate acid of an 11-HETE(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C=C\\\\\\\\C(C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12717',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a disaccharide derivative that is (S)-naringenin substituted by a 2-O-(alpha-L-rhamnopyranosyl)-beta-D-glucopyranosyl moiety at position 7 via a glycosidic linkage. It has a role as a metabolite, an antineoplastic agent and an anti-inflammatory agent. It is a disaccharide derivative, a dihydroxyflavanone, a member of 4'-hydroxyflavanones, a (2S)-flavan-4-one and a neohesperidoside. It derives from a (S)-naringenin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC3=CC(=C4C(=O)C[C@H](OC4=C3)C5=CC=C(C=C5)O)O)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28182',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of imidazo[2,1-b][1,3]oxazines in which the hydrogen at position 2 is substituted by a nitro group and in which the oxazine ring is fully saturated. It is a C-nitro compound, a member of imidazo[2,1-b][1,3]oxazines and a bicyclic nitroimidazole.\\nThe corresponding SMILES representation is:\\nC1CN2C=C(N=C2OC1)[N+](=O)[O-]\\nThe natural language question is: The molecule is a hydroxycalciol that is a synthetic analogue of vitamin D3 which contains an oxolane ring and exhibits weak vitamin D receptor agonist activity. It has a role as a vitamin D receptor agonist. 
It is a hydroxycalciol, a member of oxolanes and a member of D3 vitamins.\\nThe corresponding SMILES representation is:\\nC[C@@H]([C@H]1CC[C@@H]\\\\\\\\2[C@@]1(CCC/C2=C\\\\\\\\C=C/3\\\\\\\\C[C@H](C[C@H](C3=C)O)O)C)[C@@H]4CC[C@@H](O4)C(C)(C)O\\nThe natural language question is: The molecule is a glycosyloxyflavone that is myricetin with a beta-L-galactosyl residue attached at position 3. It has a role as a metabolite. It is a beta-L-galactoside, a monosaccharide derivative, a pentahydroxyflavone and a glycosyloxyflavone. It derives from a beta-L-galactose and a myricetin.\\nThe corresponding SMILES representation is:\\nC1=C(C=C(C(=C1O)O)O)C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C@@H]4[C@H]([C@@H]([C@@H]([C@@H](O4)CO)O)O)O\\nThe natural language question is: The molecule is a phosphatidylcholine 36:2 in which the acyl groups specified at positions 1 and 2 are octadecanoyl and (6Z,9Z)-octadecadienoyl respectively. It has a role as a mouse metabolite. It derives from a (6Z,9Z)-octadecadienoic acid and an octadecanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a diterpene alkaloid with formula C25H41NO9 that is isolated from several Aconitum species. It has a role as a plant metabolite, a human urinary metabolite, a NF-kappaB inhibitor and a xenobiotic. It is a bridged compound, a diterpene alkaloid, an organic heteropolycyclic compound, a polyether, a tertiary amino compound, a pentol, a secondary alcohol and a tertiary alcohol. 
It derives from a hydride of an aconitane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN1C[C@@]2([C@@H](C[C@@H]([C@@]34[C@@H]2[C@H]([C@@H](C31)[C@@]5([C@@H]6[C@H]4C[C@@]([C@@H]6O)([C@H]([C@@H]5O)OC)O)O)OC)OC)O)COC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15012',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of chalcones that is trans-chalcone substituted by hydroxy groups at positions 4, 2' and 4', a methoxy group at position 6' and methyl groups at positions 3' and 5'. Isolated from the buds of Cleistocalyx operculatus, it has been shown to exhibit inhibitory effects on the viral neuraminidases from two influenza viral strains, H1N1 and H9N2. It has a role as a plant metabolite and an EC 3.2.1.18 (exo-alpha-sialidase) inhibitor. It is a member of chalcones, a monomethoxybenzene and a polyphenol. It derives from a trans-chalcone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=C(C(=C(C(=C1O)C(=O)/C=C/C2=CC=C(C=C2)O)OC)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3205',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a monocarboxylic acid amide resulting from the formal condensation of the carboxy group of 4-hydroxy-2-methyl-2H-1,2-benzothiazine-3-carboxylic acid 1,1-dioxide with the exocyclic nitrogen of 2-aminopyridine. A non-steroidal anti-inflammatory drug of the oxicam class, it is used to relieve pain and works by preventing the production of endogenous prostaglandins involved in the mediation of pain, stiffness, tenderness and swelling. 
It has a role as an analgesic, a cyclooxygenase 1 inhibitor, a non-steroidal anti-inflammatory drug, an EC 1.14.99.1 (prostaglandin-endoperoxide synthase) inhibitor and an antirheumatic drug. It is a benzothiazine, a member of pyridines and a monocarboxylic acid amide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1C(=C(C2=CC=CC=C2S1(=O)=O)O)C(=O)NC3=CC=CC=N3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13941',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a dicarboxylic acid monoamide resulting from the formal condensation of one of the carboxy groups of terephthalic acid with ammonia. It is a carbamoylbenzoic acid and a dicarboxylic acid monoamide. It derives from a terephthalic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(=O)N)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22413',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an iron chelate resulting from the deprotonation of all four carboxy groups of ethylenediaminetetraacetic acid and the addition of an iron(3+) and a sodium ion. It is used for the treatment of iron deficiency anaemia. It is an iron chelate and an organic sodium salt. It contains an iron(3+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CN(CC(=O)O)CC(=O)[O-])N(CC(=O)O)CC(=O)O.[Na+].[Fe]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3788',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is cyclic tetraadenylate; major species at pH 7.3. 
It is an adenyl ribonucleotide, a cyclic oligonucleotide and an organophosphate oxoanion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(OC[C@@H]5[C@H]([C@H]([C@@H](O5)N6C=NC7=C(N=CN=C76)N)O)OP(=O)(OC[C@@H]8[C@H]([C@H]([C@@H](O8)N9C=NC2=C(N=CN=C29)N)O)OP(=O)(OC[C@@H]2[C@H]([C@H]([C@@H](O2)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O1)[O-])[O-])[O-])[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19537',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a steroid acid resulting from the oxidation of one of the terminal methyl groups of cholesterol to the corresponding aldehyde. It has a role as a bacterial metabolite. It is a steroid acid, a monocarboxylic acid, a 3beta-sterol, a cholestanoid and a 3beta-hydroxy-Delta(5)-steroid. It derives from a cholesterol. It is a conjugate acid of a 3beta-hydroxycholest-5-en-26-oate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCC(C)C(=O)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC=C4[C@@]3(CC[C@@H](C4)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17362',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a long-chain unsaturated fatty acid anion that is the conjugate base of (9Z,13S)-12,13-epoxyoctadeca-9,11-dienoic acid arising from deprotonation of the carboxylic acid function. It is a long-chain fatty acid anion and a polyunsaturated fatty acid anion. 
It is a conjugate base of a (9Z,13S)-12,13-epoxyoctadeca-9,11-dienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@H]1/C(=C/C=C\\\\\\\\CCCCCCCC(=O)[O-])/O1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17605',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an iminium salt composed of 4-{(4-anilinonaphthalen-1-yl)[4-(dimethylamino)phenyl]methylidene}-N,N-dimethylcyclohexa-2,5-dien-1-iminium and chloride ions in a 1:1 ratio. It binds to nucleic acids and can be used in standardised staining techniques suitable for automated cell-pattern recognition. It has a role as a histological dye and a fluorochrome. It is an iminium salt and an organic chloride salt. It contains a victoria blue B(1+).\\nThe corresponding SMILES representation is:\\nCN(C)C1=CC=C(C=C1)C(=C2C=CC(=[NH+]C3=CC=CC=C3)C4=CC=CC=C24)C5=CC=C(C=C5)N(C)C.[Cl-]\\nThe natural language question is: The molecule is an acyl-CoA(4-) species arising from deprotonation of the phosphate and diphosphate OH groups of (E,E)-piperonyl-CoA; major species at pH 7.3. It is an acyl-CoA(4-) and a member of benzodioxoles. It is a conjugate base of an (E,E)-piperonyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)/C=C/C=C/C4=CC5=C(C=C4)OCO5)O\\nThe natural language question is: The molecule is a mycolate ester formed by esterification of (21E)-3-hydroxy-35-[(icosan-2-yl)oxy]-35-oxo-2-pentacosylpentatriacont-21-enoic acid with the 6-OH of D-glucose; produced by Mycobacterium phlei. 
It is a monosaccharide derivative and a mycolate ester.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCCC(C(CCCCCCCCCCCCCCCCC/C=C/CCCCCCCCCCCCC(=O)OC(C)CCCCCCCCCCCCCCCCCC)O)C(=O)OC[C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O\\nThe natural language question is: The molecule is an anionic unsymmetrical C7 cyanine dye having differentially substituted indoleinine groups at each end. It has a role as a fluorochrome. It is an organosulfonate oxoanion, an indolium ion and a cyanine dye.\\nThe corresponding SMILES representation is:\\nCCN\\\\\\\\1C2=C(C3=C(C=C2)C(=CC(=C3)S(=O)(=O)[O-])S(=O)(=O)[O-])C(/C1=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\C4=[N+](C5=C(C4(C)C)C=C(C=C5)NC(=O)CI)C)(C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is an N-acyl-15-methylhexadecasphing-4-enine-1-phosphocholine in which the acyl group has 26 carbons and 0 double bonds and is 2-hydroxylated. It derives from a 15-methylhexadecasphing-4-enine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCC(C(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCCC(C)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11424',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an aminobenzenesulfonate that is the conjugate base of 2-aminobenzenesulfonic acid. It is a conjugate base of a 2-aminobenzenesulfonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)N)S(=O)(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4123',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is the conjugate base of adenosylcobalamin 5'-phosphate, formed by loss of two protons from the 5'-phosphate group. 
It is an organophosphate oxoanion and an alkylcob(III)alamin. It is a conjugate base of an adenosylcobalamin 5'-phosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=CC2=C(C=C1C)N(C=N2)[C@@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])[O-])OP(=O)([O-])O[C@H](C)CNC(=O)CC[C@@]\\\\\\\\4([C@H]([C@@H]5[C@]6([C@@]([C@@H](C(=N6)/C(=C\\\\\\\\7/[C@@]([C@@H](C(=N7)/C=C\\\\\\\\8/C([C@@H](C(=N8)/C(=C4\\\\\\\\[N-]5)/C)CCC(=O)N)(C)C)CCC(=O)N)(C)CC(=O)N)/C)CCC(=O)N)(C)CC(=O)N)C)CC(=O)N)C)O.[CH2-][C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)O.[Co]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2525',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the dianion of any alpha-D-hexose 1-phosphate arising from deprotonation of the phosphate OH groups; major species at pH 7.3. It is a conjugate base of an alpha-D-hexose 1-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1C(C(C([C@H](O1)OP(=O)([O-])[O-])O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17091',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is dizwitterionic form of N(5)-(L-1-carboxyethyl)-L-ornithine having anionic carboxy groups and cationic amino groups; major species at pH 7.3. It is a tautomer of a N(5)-(L-1-carboxyethyl)-L-ornithine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](C(=O)[O-])[NH2+]CCC[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2139',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a thia-alkylglucosinolic acid that consists of 1-thio-beta-D-glucopyranose attached to a 6-(methylsulfanyl)-N-(sulfooxy)hexanimidoyl group at the anomeric sulfur. 
It is a thia-alkylglucosinolic acid and an organic sulfide. It is a conjugate acid of a glucoberteroin(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CSCCCCC/C(=N/OS(=O)(=O)O)/S[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28804',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of 1,4-benzoquinones that is p-benzoquinone substituted by a methoxy group at position 2 and a 1-phenylallyl group at position 5. Isolated from the heartwood of Dalbergia louveli, it exhibits antiplasmodial activity. It has a role as a metabolite and an antiplasmodial drug. It is an enol ether and a member of 1,4-benzoquinones.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=O)C(=CC1=O)[C@H](C=C)C2=CC=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_32',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a p-menthane monoterpenoid that consists of cyclohex-2-enone having methyl and isopropenyl substituents at positions 2 and 5, respectively. It has a role as an allergen. It is a member of carvones and a botanical anti-fungal agent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CCC(CC1=O)C(=C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29336',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a branched amino pentasaccharide consisting of an alpha-D-Gal residue at the reducing end having an alpha-L-Fuc-(1->3)-beta-D-GalNAc-(1->4)-[alpha-L-Fuc-(1->3)]-beta-D-GlcNAc moiety attached at the 3-position. 
It has a role as an epitope.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@@H]2O)CO)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O[C@H]4[C@H]([C@@H]([C@@H]([C@@H](O4)C)O)O)O)NC(=O)C)O[C@H]5[C@H]([C@H](O[C@@H]([C@@H]5O)O)CO)O)CO)NC(=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3325',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a monocarboxylic acid that is 3,5-dioxocyclohexanecarboxylic acid substituted by a cyclopropyl(hydroxy)methylidene group at position 4. It is a metabolite of the plant growth regulator trinexapac-ethyl. It has a role as a marine xenobiotic metabolite, an agrochemical, a plant growth regulator and a gibberellin biosynthesis inhibitor. It is a member of cyclohexanones, a monocarboxylic acid, a beta-hydroxy ketone, an enol and a member of cyclopropanes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC1C(=C2C(=O)CC(CC2=O)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7320',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a glycosyl glycoside consisting of beta-D-galactopyranose and D-galactopyranose residues joined by a (1->1) glycosidic bond. It is a glycosyl glycoside and a partially-defined glycan. 
It derives from a beta-D-galactose and a D-galactopyranose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)OC2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8505',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a tetracyclic diterpenoid with formula C21H26O3, originally isolated from Tripterygium wilfordii. It has a role as a plant metabolite. It is a gamma-lactone, an aromatic ether, an organic heterotetracyclic compound and a tetracyclic triterpenoid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C1=C(C2=C(C=C1)[C@]3(CCC4=C([C@@H]3CC2)COC4=O)C)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27586',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a cinnamate ester obtained by formal condensation of the carboxy group of a 4-coumaric acid with one of the hydroxy groups of tartaric acid. It is a cinnamate ester, a dicarboxylic acid, a member of phenols and a tetraric acid derivative. It derives from a 4-coumaric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C/C(=O)OC(C(C(=O)O)O)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4525',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an anthracycline antibiotic isolated from the culture broth of Nocardia sp. MJ896-43F17. It exhibits significant antimycobacterial activity against several drug-resistant Mycobacterium smegmatis strains. It has a role as an antimycobacterial drug. 
It is an anthracycline antibiotic, an aminoglycoside, a deoxy hexoside, an aromatic ether and a tertiary amino compound.\\nThe corresponding SMILES representation is:\\nCC1C(C(CC(O1)OC2C(C(C(C3=CC4=C(C(=C23)O)C(=O)C5=C(C=CC(=C5C4=O)OC)O)O)(C)O)OC)(C)N(C)C)O\\nThe natural language question is: The molecule is an aromatic amino-acid anion that is the conjugate base of phenylalanine, arising from deprotonation of the carboxy group. It is an alpha-amino-acid anion and an aromatic amino-acid anion. It is a conjugate base of a phenylalanine.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)CC(C(=O)[O-])N\\nThe natural language question is: The molecule is an organic heterohexacyclic compound and spirooxindole-type pyranopyrimidine spiro compound in which the shared atom of the spiro system is the carbon at position 3 of 1-allyl-7-fluoro-1,3-dihydro-2H-indol-2-one. It has a role as an antineoplastic agent. It is an organic heterohexacyclic compound, an organofluorine compound, a spiro compound and a member of oxindoles.\\nThe corresponding SMILES representation is:\\nCC1=NC2=C(C(=O)N1C(=O)C)[C@]3(C4=C(C(=CC=C4)F)N(C3=O)CC=C)C5=C(O2)C6=C(C=CC(=C6)F)OC5=O\\nThe natural language question is: The molecule is a member of the class of dibenzofurans that is dibenzo[b,d]furan substituted by methoxy groups at positions 6 and 9, hydroxy groups at positions 2 and 3 and a phenolic group at position 7. It has been isolated from Aspergillus taichungensis. It has a role as an Aspergillus metabolite. It is a member of dibenzofurans, a member of catechols and an aromatic ether.\\nThe corresponding SMILES representation is:\\nCOC1=C2C3=CC(=C(C=C3OC2=C(C(=C1)C4=CC=C(C=C4)O)OC)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a carbohydrate acid anion that is the conjugate base of cellobionic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. 
It is a conjugate base of a cellobionic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@H]([C@@H](CO)O)[C@@H]([C@H](C(=O)[O-])O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10643',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a nucleoside analogue obtained by formal cyclodimerisation of thymidine. It has a role as a Mycoplasma genitalium metabolite. It is a cyclobutadipyrimidine and a nucleoside analogue.\\nThe corresponding SMILES representation is:\\nCC12C(C3C1(C(=O)NC(=O)N3[C@H]4C[C@@H]([C@H](O4)CO)O)C)N(C(=O)NC2=O)[C@H]5C[C@@H]([C@H](O5)CO)O\\nThe natural language question is: The molecule is the pharmacologically active (6S)-stereoisomer of 5-formyltetrahydrofolic acid. It has a role as an antineoplastic agent and a metabolite. It is a conjugate acid of a (6S)-5-formyltetrahydrofolate(2-).\\nThe corresponding SMILES representation is:\\nC1[C@@H](N(C2=C(N1)N=C(NC2=O)N)C=O)CNC3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)O\\nThe natural language question is: The molecule is a monocarboxylic acid that is propionic acid in which one of the methyl hydrogens is substituted by a cyclopentyl group.\\nThe corresponding SMILES representation is:\\nC1CCC(C1)CCC(=O)O\\nThe natural language question is: The molecule is an ether having methyl and tert-butyl as the two alkyl components. It has a role as a non-polar solvent, a fuel additive and a metabolite.\\nThe corresponding SMILES representation is:\\nCC(C)(C)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a sialotriaosylceramide consisting of beta-D-GalNAc-(1->4)-[alpha-Neu5Ac-(2->3)]-beta-D-Gal-(1->4)-beta-D-Glc attached to the primary hydroxy function of ceramide(d18:1/24:1(15Z)). It has a role as a mouse metabolite. 
It derives from a (15Z)-tetracosenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCC/C=C/[C@H]([C@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@@]3(C[C@@H]([C@H]([C@@H](O3)[C@@H]([C@@H](CO)O)O)NC(=O)C)O)C(=O)O)O)O)O)NC(=O)CCCCCCCCCCCCC/C=C\\\\\\\\CCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27609',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an alpha-amino acid zwitterion obtained by transfer of a proton from the carboxy to the amino group of 5-bromotryptophan; major species at pH 7.3. It is a tautomer of a 5-bromotryptophan.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1Br)C(=CN2)CC(C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3279',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a polysaccharide derivative with a repeating unit consisting of beta-D-galactosyl, beta-D-galactosyl and beta-D-glucosyl residues linked sequentially (1->3) and (1->4), to the galactosyl residue at the non-reducing end of which is attached a beta-D-galactosyl-(1->4)-N-acetyl-beta-D-glucosaminyl disaccharide unit via a (1->3) linkage, with all repeating units being linked (1->6). 
Desialylated capsular polysaccharide of Streptococcus suis serotype 14.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@H]2[C@H]([C@H](O[C@H]([C@@H]2O)O[C@H]3[C@H]([C@H](O[C@H]([C@@H]3O)O[C@@H]4[C@H](O[C@H]([C@@H]([C@H]4O)O)O)CO)CO)O)CO)O)CO)O[C@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19821',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a tetrasaccharide derivative consisting of a dideoxy-4-{[4,5,6-trihydroxy-3-(hydroxymethyl)cyclohex-2-en-1-yl C7 cyclitol moiety [called valienol (or valienamine)] linked via nitrogen to isomaltotriose. It has a role as an EC 3.2.1.20 (alpha-glucosidase) inhibitor, an EC 3.2.1.1 (alpha-amylase) inhibitor and a hypoglycemic agent. It is a conjugate base of an acarbose(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O[C@@H]2[C@H](O[C@@H]([C@@H]([C@H]2O)O)O[C@@H]3[C@H](OC([C@@H]([C@H]3O)O)O)CO)CO)O)O)N[C@H]4C=C([C@H]([C@@H]([C@H]4O)O)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28303',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a ketohexose monophosphate that is D-tagatopyranose in which the phosphono substituent is located at position 1. It derives from a D-tagatopyranose. 
It is a conjugate acid of a D-tagatopyranose 1-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H]([C@@H](C(O1)(COP(=O)(O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9234',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a tetrapeptide composed of L-alanine, L-valine, L-aspartic acid, and L-tyrosine joined in sequence by peptide linkages. It has a role as a metabolite. It derives from a L-alanine, a L-valine, a L-aspartic acid and a L-tyrosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC1=CC=C(C=C1)O)C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5771',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is 4-Aminobenzoic acid in which one of the hydrogens ortho- to the carboxylic acid group is substituted by chlorine. It is an aminobenzoic acid and a member of monochlorobenzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1N)Cl)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27225',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a carboxamide resulting from the formal condensation of the carboxylic acid group of N-(isopropoxycarbonyl)valine with the amino group of methyl 3-amino-3-(4-chlorophenyl)propanoate. 
It is a carbamate ester, a valine derivative, a carboxamide, a member of monochlorobenzenes and a methyl ester.\\nThe corresponding SMILES representation is:\\nCC(C)C(C(=O)NC(CC(=O)OC)C1=CC=C(C=C1)Cl)NC(=O)OC(C)C\\nThe natural language question is: The molecule is a mannosylinositol phosphorylceramide having a hexacosanoyl group attached to the ceramide nitrogen, with no hydroxylation at C-4 of the long-chain base or on the very-long-chain fatty acid. It derives from an Ins-1-P-Cer(d18:0/26:0). It is a conjugate acid of a Man-1-2-Ins-1-P-Cer(d20:0/26:0)(1-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)(O)O[C@@H]1[C@@H]([C@@H]([C@H]([C@@H]([C@H]1OC2[C@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)O)O)O)[C@@H](CCCCCCCCCCCCCCCCC)O\\nThe natural language question is: The molecule is an N-(fatty acyl)-L-alpha-amino acid anion that is the conjugate base of N-propanoyl-L-methionine, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a N-(fatty acyl)-L-alpha-amino acid anion and a N-(fatty acyl)-L-methionine(1-). It is a conjugate base of a N-propanoyl-L-methionine.\\nThe corresponding SMILES representation is:\\nCCC(=O)N[C@@H](CCSC)C(=O)[O-]\\nThe natural language question is: The molecule is an alkyl alcohol that is 6-methyloctane carrying a hydroxy group at position 1 (the S-stereoisomer). It has a role as a Daphnia pulex metabolite. It is an alkyl alcohol and a primary alcohol.\\nThe corresponding SMILES representation is:\\nCC[C@H](C)CCCCCO\\nNext, you will be given a sample for test.The natural language question is: The molecule is a ginsenoside found in Panax notoginseng that is dammarane which is substituted by hydroxy groups at the 3beta, 12beta and 20 pro-S positions and in which the hydroxy group at position 20 has been converted to the corresponding alpha-L-arabinofuranosyl-beta-D-glucopyranoside. It has a role as a plant metabolite, a human xenobiotic metabolite and an antineoplastic agent. 
It is a beta-D-glucoside, a disaccharide derivative, a ginsenoside, a tetracyclic triterpenoid and a 3beta-hydroxy-4,4-dimethylsteroid. It derives from a (20S)-protopanaxadiol. It derives from a hydride of a dammarane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC[C@@](C)([C@H]1CC[C@@]2([C@@H]1[C@@H](C[C@H]3[C@]2(CC[C@@H]4[C@@]3(CC[C@@H](C4(C)C)O)C)C)O)C)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO[C@H]6[C@@H]([C@H]([C@@H](O6)CO)O)O)O)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3863',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an azaspiro compound resulting from the formal fusion of position 3 of 6-chloro-oxindole with position 3 of (2R,3SS5S)-3-(3-chloro-2-fluorophenyl)-5-(2,2-dimethylpropyl)-N-[2-(morpholin-4-yl)ethyl]pyrrolidine-2-carboxamide. It is a potent inhibitor of the MDM2-p53 interaction. It has a role as an apoptosis inducer. It is an azaspiro compound, a member of morpholines, a member of oxindoles, a member of pyrrolidines, a member of monochlorobenzenes, a member of monofluorobenzenes and a secondary carboxamide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C)C[C@@H]1[C@]2([C@H]([C@@H](N1)C(=O)NCCN3CCOCC3)C4=C(C(=CC=C4)Cl)F)C5=C(C=C(C=C5)Cl)NC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21551',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a disaccharide consisting of beta-D-glucopyranose and beta-D-arabinopyranose residues joined in sequence by a (1->3) glycosidic bond. 
It derives from a beta-D-glucose and a beta-D-arabinopyranose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@H]([C@@H]([C@@H](O1)O)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11937',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 3-acyl-sn-glycerol where arachidonoyl is the 3-acyl group. It is a 1-arachidonoylglycerol and a 3-acyl-sn-glycerol. It is an enantiomer of a 1-arachidonoyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)OC[C@@H](CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21052',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a tricyclic hydrocarbon and sesquiterpene that is octahydro-1H-3a,6-methanoazulene which is substituted by a methylidene group at position 7 and by methyl groups at positions 3, 8, and 8. It is a sesquiterpene and a tricyclic hydrocarbon.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1CC[C@H]2[C@]13CCC(C3)C(=C)C2(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3219',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a 3-hydroxyaspartic acid that has R configuration at the carbon bearing the amino group. It has a role as a fungal metabolite. It is a D-aspartic acid derivative, a D-alpha-amino acid and a 3-hydroxyaspartic acid. 
It is an enantiomer of a 3-hydroxy-L-aspartic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@@H](C(C(=O)O)O)(C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7946',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a methyl ketone that is alpha-ionone in which a hydrogen at position 5 of the cyclohex-2-en-1-yl ring is substituted by a methyl group. It is a methyl ketone and an enone. It derives from an alpha-ionone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1CC=C(C(C1(C)C)/C=C/C(=O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13355',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an N-sulfonylurea that is 1-tosylurea in which a hydrogen attached to the nitrogen at position 3 is replaced by an azepan-1-yl group. A hypoglycemic agent, it is used for the treatment of type 2 diabetes mellitus. It has a role as a hypoglycemic agent and a potassium channel blocker.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC=C(C=C1)S(=O)(=O)NC(=O)NN2CCCCCC2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6346',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a very long-chain omega-3 fatty acid that is tetratriacontapentaenoic acid having five double bonds located at positions 19, 22, 25 ,28 and 31 (the 19Z,22Z,25Z,28Z,31Z-isomer). It is an omega-3 fatty acid and a tetratriacontapentaenoic acid. 
It is a conjugate acid of a (19Z,22Z,25Z,28Z,31Z)-tetratriacontapentaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22249',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of ferulic acid. It derives from a coenzyme A and a ferulic acid. It is a conjugate acid of a feruloyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)/C=C/C4=CC(=C(C=C4)O)OC)O\\nThe natural language question is: The molecule is one of the primary forms of gastrin that is a 34-membered peptide consisting of Gln, Leu, Gly, Pro, Gln, Gly, Pro, Pro, His, Leu, Val, Ala, Asp, Pro, Ser, Lys, Lys, Gln, Gly, Pro, Trp, Leu, Glu, Glu, Glu, Glu, Glu, Ala, Tyr, Gly, Trp, Met, Asp and Phe residues joined in sequence.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@@H](CC1=CC=C(C=C1)O)C(=O)NCC(=O)N[C@@H](CC2=CNC3=CC=CC=C32)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC4=CC=CC=C4)C(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC5=CNC6=CC=CC=C65)NC(=O)[C@@H]7CCCN7C(=O)CNC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CO)NC(=O)[C@@H]8CCCN8C(=O)[C@H](CC(=O)O)NC(=O)[C@H](C)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC9=CNC=N9)NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CCCN1C(=O)CNC(=O)[C@H](CCC(=O)N)NC(=O)[C@@H]1CCCN1C(=O)CNC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)N)N\\nThe natural language question is: The molecule is an imidothiocarbamic 
ester, a member of indoles and a member of maleimides. It has a role as an EC 2.7.11.13 (protein kinase C) inhibitor. It derives from a maleimide.\\nThe corresponding SMILES representation is:\\nCN1C=C(C2=CC=CC=C21)C3=C(C(=O)NC3=O)C4=CN(C5=CC=CC=C54)CCCSC(=N)N\\nThe natural language question is: The molecule is the organophosphate oxoanion obtained by deprotonation of the diphosphate hydroxy groups of the reduced form of flavin adenine dinucleotide (FADH2). It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of a FADH2.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C=C1C)N(C3=C(N2)C(=O)NC(=O)N3)C[C@@H]([C@@H]([C@@H](COP(=O)([O-])OP(=O)([O-])OC[C@@H]4[C@H]([C@H]([C@@H](O4)N5C=NC6=C(N=CN=C65)N)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the hydrochloride salt of ketamine. It has a role as an analgesic, a NMDA receptor antagonist and an intravenous anaesthetic. It contains a ketamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[NH2+]C1(CCCCC1=O)C2=CC=CC=C2Cl.[Cl-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2540',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a butan-4-olide that is tetrahydrofuran substituted by an oxo group at position 2. It has a role as a neurotoxin and a metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC(=O)OC1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5660',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a dipeptide obtained by formal condensation of the carboxy group of L-aspartic acid with the amino group of L-methionine. 
It derives from a L-aspartic acid and a L-methionine.\\nThe corresponding SMILES representation is:\\nCSCC[C@@H](C(=O)O)NC(=O)[C@H](CC(=O)O)N\\nThe natural language question is: The molecule is the D-enantiomer of homoserine. It is a homoserine and a D-alpha-amino acid. It is an enantiomer of a L-homoserine. It is a tautomer of a D-homoserine zwitterion.\\nThe corresponding SMILES representation is:\\nC(CO)[C@H](C(=O)O)N\\nThe natural language question is: The molecule is a palmitate ester resulting from the formal condensation of the carboxy group of palmitic acid with the hydroxy group of octan-1-ol. It has a role as a bacterial metabolite. It is a hexadecanoate ester and a wax ester. It derives from an octan-1-ol.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OCCCCCCCC\\nThe natural language question is: The molecule is the organic anion resulting from the removal of a proton from the hydrogen-bearing nitrogen atom of dantrolene. It is a conjugate base of a dantrolene.\\nThe corresponding SMILES representation is:\\nC1C(=NC(=O)N1N=CC2=CC=C(O2)C3=CC=C(C=C3)[N+](=O)[O-])[O-]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a very long-chain omega-3 fatty acid that is triacontapentaenoic acid having five double bonds located at positions 15, 18, 21, 24 and 27 (the 15Z,18Z,21Z,24Z,27Z-isomer). It is an omega-3 fatty acid and a triacontapentaenoic acid. 
It is a conjugate acid of a (15Z,18Z,21Z,24Z,27Z)-triacontapentaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18465',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a cytochalasan alkaloid found in Chaetomium globosum and Chaetomium subaffine. It has a role as an antineoplastic agent and a Chaetomium metabolite. It is a cytochalasan alkaloid, a member of indoles, a macrocycle, an epoxide and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\nC[C@H]\\\\\\\\1C/C=C/[C@H]2[C@H]3[C@](O3)([C@H]([C@@H]4[C@@]2(C(=O)CC[C@@H](C(=O)/C(=C1)/C)O)C(=O)N[C@H]4CC5=CNC6=CC=CC=C65)C)C\\nThe natural language question is: The molecule is an N-acylsphingosine in which the ceramide N-acyl group is specified as 30-[(9Z,12Z)-octadeca-9,12-dienoyloxy]triacontanoyl. It is a N-acylsphingosine and an omega-linoleoyloxy-O-ultra-long chain acylceramide.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCC/C=C/[C@H]([C@H](CO)NC(=O)CCCCCCCCCCCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC)O\\nThe natural language question is: The molecule is an acetate ester obtained by the formal condensation of acetic acid with propanol. It has a role as a fragrance and a plant metabolite. It derives from a propan-1-ol.\\nThe corresponding SMILES representation is:\\nCCCOC(=O)C\\nThe natural language question is: The molecule is the straight-chain keto form of D-fructuronic acid. It derives from a keto-D-fructose. 
It is a conjugate acid of a keto-D-fructuronate.\\nThe corresponding SMILES representation is:\\nC(C(=O)[C@H]([C@@H]([C@@H](C(=O)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the alpha-anomer of N-acetyl-D-galactosamine 1-phosphate. It has a role as an Escherichia coli metabolite. It is a conjugate acid of a N-acetyl-alpha-D-galactosamine 1-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1OP(=O)(O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4712',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an oligosaccharide derivative that is a tridecasaccharide derivative, the oligosaccharide portion of the Proteus penneri strain 12 lipopolysaccharide (LPS) core region. The 2-amino-2-deoxy-beta-D-galacturonic acid (beta-D-GalAN) residue may or may not be present.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@@H]2[C@H](O[C@@H]([C@@H]([C@H]2O)N)O[C@@H]3[C@@H]([C@H]([C@H](O[C@@H]3C(=O)O)O[C@H]4[C@@H]([C@H](O[C@@H]([C@H]4O)O[C@@H]5[C@@H]([C@H](O[C@@H]([C@H]5O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)[C@H](CO)O)O[C@@H]7[C@@H](C[C@@](O[C@@H]7[C@@H](CO[C@@H]8[C@@H]([C@H]([C@H](CO8)N)O)O)O)(C(=O)O)O)O[C@@]9(C[C@H]([C@H]([C@H](O9)[C@@H](CO)O)O)O)C(=O)O)O)[C@H](CO[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@H](CO[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)C(=O)O)O)O)N)O)O)O)O)OP(=O)(O)OCCN)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@@H](CO)O)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@H](CO)O)O)O)O)O)CO)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26389',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a tertiary amino compound that consists of glycine 
bearing two N-phosphonomethyl substituents. It has a role as a plant growth retardant. It is a glycine derivative, a member of phosphonic acids and a tertiary amino compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(=O)O)N(CP(=O)(O)O)CP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29039',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organic potassium salt, a cyanine dye, an organosulfonate salt, an organoiodine compound, a secondary carboxamide, an organic heterotricyclic compound and a member of indoles. It has a role as a fluorochrome.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1(C2=C(C=CC(=C2)NC(=O)CI)[N+](=C1/C=C/C=C/C=C/3\\\\\\\\C(C4=C(N3CCS(=O)(=O)[O-])C=CC5=C4C=C(C=C5S(=O)(=O)[O-])S(=O)(=O)[O-])(C)C)C)C.[K+].[K+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1510',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 5-alkylresorcinol in which the alkyl group is specified as methyl. It has a role as an Aspergillus metabolite. It is a 5-alkylresorcinol and a dihydroxytoluene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=CC(=C1)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23483',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a propanoate ester resulting from the formal condensation of the hydroxy group of propanol with the carboxy group of propanoic acid. It has a role as a human metabolite, a rat metabolite, a biomarker and a fungal metabolite. 
It derives from a propan-1-ol.\\nThe corresponding SMILES representation is:\\nCCCOC(=O)CC\\nThe natural language question is: The molecule is the monoazo compound formed from arsanilic acid. It is used as an immunologic research tool. It has a role as a hapten and an allergen. It is a monoazo compound and an organoarsonic acid. It derives from an arsanilic acid.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1N=NC2=CC=C(C=C2)[As](=O)(O)O)[As](=O)(O)O\\nThe natural language question is: The molecule is an amino trisaccharide consisting of 4,6-dideoxy-alpha-L-xylo-hexopyranose, beta-D-galactopyranose and 2-acetamido-2-deoxy-D-glucopyranose residues joined in sequence by (1->2) and (1->4) glycosidic bonds. It is an amino trisaccharide and a member of acetamides. It derives from a beta-D-Galp-(1->4)-D-GlcpNAc.\\nThe corresponding SMILES representation is:\\nC[C@H]1C[C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@H]2O[C@@H]3[C@H](OC([C@@H]([C@H]3O)NC(=O)C)O)CO)CO)O)O)O)O\\nThe natural language question is: The molecule is a diterpenoid of the clerodane group isolated from the bark of Casearia grewiifolia and has been shown to exhibit antimalarial and antimycobacterial activity. It has a role as a metabolite, an antimalarial and an antimycobacterial drug. It is an organic heterotricyclic compound, an acetate ester, a diterpenoid and a cyclic ether.\\nThe corresponding SMILES representation is:\\nC[C@H]1C[C@H]([C@@]23[C@@H]([C@@]1(C)CCC(=C)C=C)C[C@@H](C=C2[C@@H](O[C@@H]3OC(=O)C)OC(=O)C)OC(=O)CC(C)C)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is the trichloride salt of ytterbium(III). It has a role as a NMR shift reagent. 
It contains a ytterbium(3+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[Cl-].[Cl-].[Cl-].[Yb+3]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22445',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a monocarboxylic acid amide obtained by the formal condensation of propionic acid with ammonia. It is a monocarboxylic acid amide and a primary fatty amide. It derives from a propionic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12242',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an aralkylamino compound that is benzylamine substituted by a methoxy group at the para position. It is a primary amino compound, an aromatic ether and an aralkylamino compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC=C(C=C1)CN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24786',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of the class of benzimidazoles that is benzimidalole which is substituted at position 2 by a (methoxycarbonyl)amino group and at position 5 by a 2-thienoyl group. It is an antineoplastic agent that exerts its effect by depolymerising microtubules. It has a role as an antineoplastic agent, a tubulin modulator, an antimitotic and a microtubule-destabilising agent. 
It is a member of thiophenes, a member of benzimidazoles, a carbamate ester and an aromatic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=CS3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5849',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 4-(diethylamino)but-2-yn-1-ol yhat has S configuration. In contrast to the (R)- enantiomer, esoxybutynin exhibits essentially no anticholinergic activity. It has a role as a muscarinic antagonist, a local anaesthetic and a calcium channel blocker. It is an enantiomer of a (R)-oxybutynin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN(CC)CC#CCOC(=O)[C@](C1CCCCC1)(C2=CC=CC=C2)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16802',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a pyranone that is 4-hydroxy-3-methyl-2H-pyran-2-one in which two of the hydrogens of the methyl group are replaced by a cycloprop-2-en-1-yl group and a 3-{[N-(tert-butoxycarbonyl)-beta-alanyl]amino}phenyl group (S-configuration) and in which the hydrogen at position 6 is replaced by a 1-phenylbutan-2-yl group (R-configuration). It has a role as a HIV protease inhibitor. 
It is a member of cyclopropenes, a carbamate ester, an anilide, a monocarboxylic acid amide and a member of 2-pyranones.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](CC1=CC=CC=C1)C2=CC(=C(C(=O)O2)[C@@H](C3C=C3)C4=CC(=CC=C4)NC(=O)CCNC(=O)OC(C)(C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24066',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an oxygen hydride. It has a role as a mouse metabolite. It is a conjugate base of a water.\\nThe corresponding SMILES representation is:\\n[OH-]\\nThe natural language question is: The molecule is a medium-chain unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (3E,5Z)-dodecadienoic acid. It is a (3E,5Z)-dienoyl-CoA, a medium-chain fatty acyl-CoA and an unsaturated fatty acyl-CoA. It is a conjugate acid of a (3E,5Z)-dodecadienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCCC/C=C\\\\\\\\C=C\\\\\\\\CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is a steroid acid anion that is the conjugate base of (25R)-3beta-hydroxycholest-5-en-7-one-26-oic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It has a role as a human xenobiotic metabolite. It is a conjugate base of a (25R)-3beta-hydroxycholest-5-en-7-one-26-oic acid.\\nThe corresponding SMILES representation is:\\nC[C@H](CCC[C@@H](C)C(=O)[O-])[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2C(=O)C=C4[C@@]3(CC[C@@H](C4)O)C)C\\nThe natural language question is: The molecule is a flavin mononucleotide that is FMN in which the 8-methyl group has been oxidised to the corresponding aldehyde. 
It is a flavin mononucleotide, a ribitol phosphate and an arenecarbaldehyde. It derives from a FMN. It is a conjugate acid of an 8-formyl-8-demethylriboflavin 5'-phosphate(3-).\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C=C1C=O)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]([C@@H]([C@@H](COP(=O)(O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a dipeptide zwitterion resulting from transfer of a proton from the carboxy to the amino group of Cys(IAN)-Gly; major species at pH 7.3. It is a tautomer of a Cys(IAN)-Gly.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC=C2C(=C1)C(=CN2)C(C#N)SC[C@@H](C(=O)NCC(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8694',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a triterpenoid saponin isolated from the roots of of the Madagascan plant Albizia gummifera and has been shown to exhibit cytotoxicity against human ovarian cancer cell line. It has a role as an antineoplastic agent and a plant metabolite. It is an enoate ester, a pentacyclic triterpenoid and a triterpenoid saponin. 
It derives from a hydride of an oleanane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O[C@H]3CC[C@]4([C@H](C3(C)C)CC[C@@]5([C@@H]4CC=C6[C@]5(C[C@H]([C@@]7([C@H]6CC([C@H](C7)OC(=O)/C(=C/CC[C@@](C)(C=C)O[C@H]8[C@@H]([C@H]([C@@H]([C@H](O8)C)OC(=O)/C(=C/CC[C@](C)(C=C)O[C@H]9[C@@H]([C@H]([C@@H]([C@H](O9)C)OC(=O)/C(=C/CC[C@](C)(C=C)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)C)O)O)O)/C)O)O)/C)O)O)/C)(C)C)C(=O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)C)O[C@H]1[C@@H]([C@H]([C@@H](CO1)O)O)O)O)O)O)C)C)C)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7810',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a cholestanoid that is cholest-4-en-3-one which carries a hydroxy group at position 25. It has a role as a bacterial metabolite. It is a 3-oxo-Delta(4) steroid, a cholestanoid and a 25-hydroxy steroid. It derives from a cholest-4-en-3-one.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCC(C)(C)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CCC4=CC(=O)CC[C@]34C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16429',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a linear tetrapyrrole, product of heme degradation. An isomer of bilirubin. It has a role as a metabolite. 
It is a member of biladienes and a dicarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(NC(=C1CCC(=O)O)CC2=C(C(=C(N2)/C=C\\\\\\\\3/C(=C(C(=O)N3)C)C=C)C)CCC(=O)O)/C=C/4\\\\\\\\C(=C(C(=O)N4)C=C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9697',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 4-pyranone with a 2,3-double bond carrying a hydroxy group at position 3 and a hydroxymethyl group at position 6. It is a deoxyketohexose and an anhydrohexose. It is a conjugate acid of an ascopyrone P(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H](OC=C(C1=O)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24950',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an organic heterobicyclic compound that is a fusion product between benzene and thiazole. The parent of the class of benzothiazoles. It has a role as a plant metabolite, a xenobiotic and an environmental contaminant.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)N=CS2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28426',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a GDP-sugar having 4-dehydro-6-deoxy-alpha-D-mannose as the sugar portion. It has a role as an Escherichia coli metabolite and a mouse metabolite. It derives from a GDP-alpha-D-mannose. 
It is a conjugate acid of a GDP-4-dehydro-6-deoxy-alpha-D-mannose(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C(=O)[C@@H]([C@@H]([C@H](O1)OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C3N=C(NC4=O)N)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_355',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a 3beta-sterol consisting of an ergostane skeleton with double bonds at 7- and 22-positions. It has a role as a metabolite, an anti-HSV-1 agent, an EC 3.2.1.18 (exo-alpha-sialidase) inhibitor and an antifungal agent. It derives from a hydride of a 5alpha-ergostane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](/C=C/[C@H](C)C(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(CC[C@@H](C4)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7294',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a (6S)-vomifoliol with a R configuration for the hydroxy group at position 9. It has a role as a phytotoxin and a metabolite. It is an enantiomer of a (6R,9S)-vomifoliol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=O)CC([C@]1(/C=C/[C@@H](C)O)O)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11408',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 4-hydroxy-3-polyprenylbenzoate in which the polyprenyl chain contains 9 prenyl units; major species at pH 7.3. 
It is a conjugate base of a 4-hydroxy-3-all-trans-nonaprenylbenzoic acid.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C/CC1=C(C=CC(=C1)C(=O)O)[O-])/C)/C)/C)/C)/C)/C)/C)/C)C\\nThe natural language question is: The molecule is an abietane diterpenoid isolated from the stem bark of Fraxinus sieboldiana. It has a role as a plant metabolite. It is a diterpene lactone, an abietane diterpenoid and a tetracyclic diterpenoid.\\nThe corresponding SMILES representation is:\\nCC(C)C1=C(C=C2C(=C1)[C@H]([C@@H]3[C@@H]4[C@@]2(CCCC4(C)C)C(=O)O3)O)O\\nThe natural language question is: The molecule is a mannotriose that is alpha-D-mannopyranose in which the hydroxy groups at positions 2 and 4 have each been converted into the corresponding alpha-D-mannopyranoside. It derives from an alpha-D-Manp-(1->4)-alpha-D-Manp and an alpha-D-Manp-(1->2)-alpha-D-Manp.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O[C@@H]2[C@H](O[C@@H]([C@H]([C@H]2O)O[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)CO)O)O)O)O\\nThe natural language question is: The molecule is an alpha-D-galactosyl-(1->4)-beta-D-galactosyl-(1->4)-beta-D-glucosylceramide in which the ceramide N-acyl group is specified as (17Z)-hexacosenoyl. It has a role as a human blood serum metabolite.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCC/C=C/[C@H]([C@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O)O)O)NC(=O)CCCCCCCCCCCCCCC/C=C\\\\\\\\CCCCCCCC)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydroxyicosapentaenoic acid that consists of 6E,8Z,11Z,14Z,17Z-icosapentaenoic acid with the hydroxy group located at position 5. It has a role as a mouse metabolite. 
It is a conjugate acid of a 5-HEPE(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C(CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20971',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a dinitrile that is tert-butylbenzene in which the hydrogen at the para- position is substituted by a 4,4-dicyano-2-methylbuta-1,3-dien-1-yl group (the trans isomer). It is used as a matrix in matrix-assisted laser desorption/ionization (MALDI) mass spectrometry. It has a role as a MALDI matrix material.\\nThe corresponding SMILES representation is:\\nC/C(=C\\\\\\\\C1=CC=C(C=C1)C(C)(C)C)/C=C(C#N)C#N\\nThe natural language question is: The molecule is a nitroso compound that is triazane in which the the nitrogen at position 1 is substituted by two 2-aminoethyl groups, that at position 2 is substituted by a hydroxy group, and that at position 3 is substituted by an oxo group. It has a role as a nitric oxide donor. It is a tertiary amino compound and a nitroso compound. It derives from a hydride of a triazane.\\nThe corresponding SMILES representation is:\\nC(CN(CCN)/[N+](=N/O)/[O-])N\\nThe natural language question is: The molecule is a organic heterotetracyclic compound that is a red pigment obtained from the wood of Caesalpinia echinata (Brazil-wood) or Caesalpinia sappan (sappan-wood). It has a role as a plant metabolite, a histological dye, an antineoplastic agent, a biological pigment, an anti-inflammatory agent, an apoptosis inducer, an antioxidant, an antibacterial agent, a NF-kappaB inhibitor and a hepatoprotective agent. 
It is an organic heterotetracyclic compound, a member of catechols and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\nC1C2=CC(=C(C=C2[C@H]3[C@@]1(COC4=C3C=CC(=C4)O)O)O)O\\nThe natural language question is: The molecule is a 1-alkyl-sn-glycero-3-phosphocholine in which the alkyl group is specified as (11Z)-octadecenyl. It has a role as a human xenobiotic metabolite. It is a lysophosphatidylcholine O-18:1 and a 1-alkyl-sn-glycero-3-phosphocholine.\\nThe corresponding SMILES representation is:\\nCCCCCC/C=C\\\\\\\\CCCCCCCCCCOC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a D-fructofuranose 1,6-bisphosphate with a beta-configuration at the anomeric position. It has a role as a mouse metabolite. It derives from a beta-D-fructofuranose. It is a conjugate acid of a beta-D-fructofuranose 1,6-bisphosphate(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@](O1)(COP(=O)(O)O)O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14387',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an N(4)-acetylcytidine 5'-monophosphate that results from the removal of two protons from the phosphate group; major species at pH 7.3.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=O)NC1=NC(=O)N(C=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24578',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a 1-(alk-1-enyl)-2-acyl-sn-glycero-3-phosphate in which the alk-1-enyl and acyl groups are specified as (1Z)-octadecenyl and oleoyl respectively. It derives from an oleic acid. 
It is a conjugate acid of a 1-[(1Z)-octadecenyl]-2-oleoyl-sn-glycero-3-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC/C=C\\\\\\\\OC[C@H](COP(=O)(O)O)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18434',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the stable isotope of zinc with relative atomic mass 66.927131, 4.10 atom percent natural abundance and nuclear spin 5/2.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[67Zn]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2148',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organosulfonate oxoanion which is also a monocarboxylic acid anion obtained by deprotonation of the sulfo and carboxy groups of tartrazine acid. It is a monocarboxylic acid anion and an organosulfonate oxoanion. It is a conjugate base of a tartrazine acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1N=NC2C(=NN(C2=O)C3=CC=C(C=C3)S(=O)(=O)[O-])C(=O)[O-])S(=O)(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9042',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a cytochalasan alkaloid found in Chaetomium globosum.( Compound class : cytochalasan alkaloid) It has a role as a Chaetomium metabolite. 
It is a cytochalasan alkaloid, a member of indoles, a macrocycle and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]\\\\\\\\1C/C=C/[C@H]2[C@@H]([C@@]([C@H]([C@@H]3[C@@]2(C(=O)/C=C/C(=O)[C@@H](/C(=C1)/C)O)C(=O)N[C@H]3CC4=CNC5=CC=CC=C54)C)(C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13162',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an oxindole that is 3-methyleneoxindole in which one of the hydrogens of the methylene group is substituted by a 3-(2-carboxyethyl)-4-methyl-1H-pyrrol-2-yl group. It is an ATP-competitive inhibitor of the tyrosine kinase activity of fibroblast growth factor receptor 1. It has a role as a fibroblast growth factor receptor antagonist. It is a monocarboxylic acid, a member of pyrroles and a member of oxindoles. It derives from a 3-methyleneoxindole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CNC(=C1CCC(=O)O)/C=C\\\\\\\\2/C3=CC=CC=C3NC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4996',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of the class of quinolines that is the carboxamide of 4-{3-chloro-4-[(cyclopropylcarbamoyl)amino]phenoxy}-7-methoxyquinoline-6-carboxylic acid. A multi-kinase inhibitor and orphan drug used (as its mesylate salt) for the treatment of various types of thyroid cancer that do not respond to radioiodine. It has a role as a vascular endothelial growth factor receptor antagonist, an orphan drug, an antineoplastic agent, an EC 2.7.10.1 (receptor protein-tyrosine kinase) inhibitor and a fibroblast growth factor receptor antagonist. 
It is a member of quinolines, an aromatic ether, a monocarboxylic acid amide, an aromatic amide, a member of monochlorobenzenes, a member of cyclopropanes and a member of phenylureas. It is a conjugate base of a lenvatinib(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC2=NC=CC(=C2C=C1C(=O)N)OC3=CC(=C(C=C3)NC(=O)NC4CC4)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14570',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a citronellol that is oct-6-ene substituted by a hydroxy group at position 1 and methyl groups at positions 3 and 7 (the 3R-enantiomer). It is an enantiomer of a (S)-(-)-citronellol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC=C(C)C)CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27880',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a C16, monounsaturated fatty acid with a double bond at position 11; a key intermediate in silkworm pheromone biosynthesis.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC/C=C\\\\\\\\CCCCCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16816',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a glucotetrose consisting of three alpha-D-glucopyranosyl residues and a beta-D-glucopyransyl residue joined in sequence by (1->6) glycosidic bonds. 
It derives from an alpha-D-Glcp-(1->6)-alpha-D-Glcp-(1->6)-alpha-D-Glcp and an alpha-D-Glcp-(1->6)-alpha-D-Glcp-(1->6)-beta-D-Glcp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@H](O2)OC[C@@H]3[C@H]([C@@H]([C@H]([C@H](O3)OC[C@@H]4[C@H]([C@@H]([C@H]([C@@H](O4)O)O)O)O)O)O)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11372',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a DiHETE(1-) that is the conjugate base of 14(R),15(S)-DiHETE arising from deprotonation of the carboxylic acid function; major species at pH 7.3. It is a dihydroxyicosatetraenoate and a long-chain fatty acid anion. It is a conjugate base of a 14(R),15(S)-DiHETE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@@H]([C@@H](/C=C/C=C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12765',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of benzoic acids that is salicylic acid in which the hydrogen that is attached to the phenolic hydroxy group has been replaced by an acetoxy group. A non-steroidal anti-inflammatory drug with cyclooxygenase inhibitor activity. It has a role as a non-steroidal anti-inflammatory drug, a non-narcotic analgesic, a platelet aggregation inhibitor, an antipyretic, a cyclooxygenase 2 inhibitor, a cyclooxygenase 1 inhibitor, a prostaglandin antagonist, a teratogenic agent, an anticoagulant, a plant activator, an EC 1.1.1.188 (prostaglandin-F synthase) inhibitor and a drug allergen. It is a member of benzoic acids, a member of salicylates and a member of phenyl acetates. It derives from a salicylic acid. 
It is a conjugate acid of an acetylsalicylate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)OC1=CC=CC=C1C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7137',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a fumonisin that is fumonisin B2 that is lacking hydroxy group located gamma- to the amino substituent. It has a role as an Aspergillus metabolite. It is a fumonisin, a primary amino compound, a secondary alcohol and a diester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC[C@@H](C)[C@H]([C@H](C[C@@H](C)CCCCCCCC[C@@H]([C@H](C)N)O)OC(=O)C[C@@H](CC(=O)O)C(=O)O)OC(=O)C[C@@H](CC(=O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6631',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 3-methylbut-2-enoic acid. It has a role as a mouse metabolite. It is a short-chain fatty acyl-CoA, a methyl-branched fatty acyl-CoA, a 2-enoyl-CoA and a monounsaturated fatty acyl-CoA. It derives from a but-2-enoyl-CoA and a 3-methylbut-2-enoic acid. 
It is a conjugate acid of a 3-methylbut-2-enoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24672',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a member of the class of furans that is furan which is substituted at positions 2 and 5 by formyl and hydroxymethyl substituents, respectively. Virtually absent from fresh foods, it is naturally generated in sugar-containing foods during storage, and especially by drying or cooking. It is the causative component in honey that affects the presystemic metabolism and pharmacokinetics of GZ in-vivo. It has a role as an indicator and a Maillard reaction product. It is a member of furans, an arenecarbaldehyde and a primary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(OC(=C1)C=O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19349',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a N-acylsphingosine in which the ceramide N-acyl group is specified as tricosanoyl. It has a role as a mouse metabolite. It is a N-acylsphingosine and a Cer(d41:1). 
It derives from a tricosanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@@H](/C=C/CCCCCCCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14877',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an aminopyrimidine that is pyrimidine-2,4-diamine which is substituted at position 5 by a p-chlorophenyl group and at position 6 by an ethyl group. It is a folic acid antagonist used as an antimalarial or with a sulfonamide to treat toxoplasmosis. It has a role as an antimalarial, an EC 1.5.1.3 (dihydrofolate reductase) inhibitor and an antiprotozoal drug. It is an aminopyrimidine and a member of monochlorobenzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=C(C(=NC(=N1)N)N)C2=CC=C(C=C2)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2521',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a glycophytoceramide having an alpha-D-galactopyranosyl residue at the O-1 position and a dodecanoyl group attached to the nitrogen. 
It derives from an alpha-D-galactose and a dodecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCC[C@H]([C@H]([C@H](CO[C@@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)NC(=O)CCCCCCCCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22848',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 1-phosphatidyl-1D-myo-inositol 3,5-bisphosphate(5-) arising from deprotonation of all five free phosphate OH groups of 1,2-dipalmitoyl-sn-glycero-3-phospho-(1'-D-myo-inositol-3',5'-bisphosphate); major species at pH 7.3. It is a conjugate base of a 1,2-dipalmitoyl-sn-glycero-3-phospho-(1'D-myo-inositol-3',5'-bisphosphate).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OC1[C@@H]([C@H](C([C@H]([C@H]1O)OP(=O)([O-])[O-])O)OP(=O)([O-])[O-])O)OC(=O)CCCCCCCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18168',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a hydroxamic acid anion resulting from the removal of a proton from each of the hydroxamic acid groups of desferrialbomycin delta2. 
It is a conjugate base of a desferrialbomycin delta2.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N(CCC[C@@H](C(=O)N[C@@H](CCCN(C(=O)C)[O-])C(=O)N[C@@H](CCCN(C(=O)C)[O-])C(=O)N[C@@H](CO)C(=O)N[C@H]([C@@H]([C@@H]1[C@@H]([C@H]([C@@H](S1)N2C=C/C(=N\\\\\\\\C(=O)N)/N(C2=O)C)O)O)O)C(=O)O)N)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20560',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a biflavonoid that is apigenin substituted by a 4-(5,7-dihydroxy-4-oxo-4H-chromen-2-yl)phenoxy group at position 6. A diflavonyl ether, it is isolated from Rhus succedanea and has been found to possess significant cytotoxic potential. It has a role as a neuroprotective agent, an antineoplastic agent and a metabolite. It is a biflavonoid, an aromatic ether and a hydroxyflavone. It derives from an apigenin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)OC4=CC=C(C=C4)C5=CC(=O)C6=C(C=C(C=C6O5)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4339',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an eight-membered oligopeptide comprising Arg, Pro, Pro, Gly, Phe, Ser, Pro and Phe residues joined in sequence. It is an analogue of bradykinin lacking the Arg residue at position 9. It has a role as a bradykinin receptor B2 agonist. 
It is a conjugate base of a [des-Arg(9)]-bradykinin(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[C@H](N(C1)C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCN=C(N)N)N)C(=O)NCC(=O)N[C@@H](CC3=CC=CC=C3)C(=O)N[C@@H](CO)C(=O)N4CCC[C@H]4C(=O)N[C@@H](CC5=CC=CC=C5)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19744',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the L-enantiomer of isoleucine. It has a role as a Saccharomyces cerevisiae metabolite, an Escherichia coli metabolite, a plant metabolite, a human metabolite, an algal metabolite and a mouse metabolite. It is an aspartate family amino acid, a proteinogenic amino acid, an isoleucine and a L-alpha-amino acid. It is a conjugate base of a L-isoleucinium. It is a conjugate acid of a L-isoleucinate. It is an enantiomer of a D-isoleucine. It is a tautomer of a L-isoleucine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11258',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a 5-oxo monocarboxylic acid anion obtained by deprotonation of the carboxy group of any diastereomer of jasmonic acid; major species at pH 7.3. It has a role as a member of jasmonates.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\CC1C(CCC1=O)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13026',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an octadecadienoyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (6Z,11E)-octadecadienoic acid. 
It is a conjugate acid of a (6Z,11E)-octadecadienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C/CCC/C=C\\\\\\\\CCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8973',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an (omega-1)-hydroxy fatty acid that is the conjugate base of 5-hydroxyhexanoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is an (omega-1)-hydroxy fatty acid anion, a medium-chain fatty acid anion and a hydroxy saturated fatty acid anion. It is a conjugate base of a 5-hydroxyhexanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCCC(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21807',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a cytochalasan alkaloid found in Chaetomium globosum and Chaetomium subaffine. It has a role as a Chaetomium metabolite. It is a cytochalasan alkaloid, an epoxide, a member of indoles and a macrocycle.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]\\\\\\\\1C/C=C/[C@H]2C3[C@](O3)([C@H]([C@@H]4[C@@]2(C(=O)CCC(=O)C(=O)/C(=C1)/C)C(=O)N[C@H]4CC5=CNC6=CC=CC=C65)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25226',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is the radioactive isotope of hydrogen with relative atomic mass 3.016049 and half-life of 12.33 years (from Greek taurhoiotatauomicronsigma, third). 
It contains a triton.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[3HH]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28743',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an oligosaccharide sulfate that is 2-acetamido-2-deoxy-6-O-sulfo-D-galactopyranose in which the hydroxy group at position 3 has been converted into the corresponding 2-O-sulfo-alpha-L-threo-hex-4-enopyranuronoxyl derivative. It is an oligosaccharide sulfate, an amino disaccharide, a monocarboxylic acid, a member of acetamides and an enol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](OC1O)COS(=O)(=O)O)O)O[C@H]2[C@@H]([C@H](C(=C(O2)C(=O)O)O)O)OS(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27240',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a complex heterocyclic sulfonium compound with an imidazolium core, used to treat hypertension. It has a role as a vasodilator agent, an antihypertensive agent, an anaesthesia adjuvant and a nicotinic antagonist.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC2C3C(C[S+]2C1)N(C(=O)N3CC4=CC=CC=C4)CC5=CC=CC=C5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17122',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a depsipeptide isolated from Jaspis splendens. It has a role as an antineoplastic agent, an animal metabolite and a marine metabolite. 
It is a depsipeptide, a member of indoles, an organobromine compound, a member of catechols and a carboxylic ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](C[C@H](C)OC(=O)CC(=O)C1=CC(=C(C=C1)O)O)/C=C(\\\\\\\\C)/C[C@H](C)C(=O)N[C@@H](C)C(=O)N(C)[C@H](CC2=C(NC3=CC=CC=C32)Br)C(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13818',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxy monocarboxylic acid anion resulting from the deprotonation of the carboxy group of 4-O-beta-D-glucosyl-trans-caffeic acid. The major specides at pH 7.3. It derives from a trans-caffeate. It is a conjugate base of a 4-O-beta-D-glucosyl-trans-caffeic acid.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C=C1/C=C/C(=O)O)[O-])O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O\\nThe natural language question is: The molecule is an artemoin in which the two hydroxy groups on the C-30 side-chain are located at positions 19 and 20. It has a role as a mouse metabolite, a plant metabolite and a rat metabolite.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCC(C(CCCCCCCCCCCCCCCCCCC1=CC(OC1=O)C)O)O\\nThe natural language question is: The molecule is a 1-alkylglycerone 3-phosphate(2-) obtained by deprotonation of the phosphate OH groups of 1-pentadecylglycerone 3-phosphate; major species at pH 7.3. It is a conjugate base of a 1-pentadecylglycerone 3-phosphate.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCOCC(=O)COP(=O)([O-])[O-]\\nThe natural language question is: The molecule is a pyridone that is 2-pyridone substituted with a carboxamide group at C-5 and a methyl group at N-1. It has a role as a metabolite and a mouse metabolite. 
It is a pyridinecarboxamide, a pyridone and a member of methylpyridines.\\nThe corresponding SMILES representation is:\\nCN1C=C(C=CC1=O)C(=O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is an oligosaccharide phosphate consisting of beta-D-galactose having 3-O-phosphono-beta-D-glucosyl and alpha-L-rhamnosyl residues attached at positions 4 and 2 respectively. It is an oligosaccharide phosphate and a trisaccharide derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@H]2O)CO)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)OP(=O)(O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29565',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organic iodide salt. It has a role as a fluorochrome. It contains a diIC18(7)(1+). It derives from a C7-indocyanine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCN\\\\\\\\1C2=CC=CC=C2C(/C1=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\C3=[N+](C4=CC=CC=C4C3(C)C)CCCCCCCCCCCCCCCCCC)(C)C.[I-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13207',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an amino disaccharide consisting of 2-acetamido-2-deoxy-beta-D-galactopyranose and 2-acetamido-2-deoxy-alpha-D-glucopyranose residues joined in sequence by a (1->4) glycosidic bond. 
It is an amino disaccharide and a member of acetamides.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1O)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)NC(=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1976',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a terminal acetylenic compound that is prop-2-yne substituted by a hydroxy group at position 1. It has a role as a Saccharomyces cerevisiae metabolite and an antifungal agent. It is a terminal acetylenic compound, a volatile organic compound and a propynol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C#CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14142',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a dihydroxyanthraquinone that is 9,10-anthraquinone bearing hydroxy substituents at positions 1 and 8, a methoxy group at position 3, and a methyl group at position 6. It has been widely isolated and characterised from both terrestrial and marine sources. It has a role as an apoptosis inducer, an antineoplastic agent, a hepatoprotective agent, an anti-inflammatory agent, an antibacterial agent, an antifungal agent and a metabolite. It derives from a 2-methylanthraquinone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C=C(C=C3O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_225',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a citrate salt of tandospirone , comprising equimolar amounts of citric acid and tandospirone. It is an anxiolytic drug used in the treatment of anxiety disorders. 
It has a role as an anxiolytic drug and an antidepressant. It contains a tandospirone(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[C@H]2C[C@@H]1[C@H]3[C@@H]2C(=O)N(C3=O)CCCCN4CCN(CC4)C5=NC=CC=N5.C(C(=O)O)C(CC(=O)O)(C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_605',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an aryl sulfide that is (2E)-3-phenyl-N-(2-sulfanylphenyl)prop-2-enamide in which the hydrogen of the thiol group is substituted by a 3-(dimethylamino)propyl group. It is a 5-hydroxytryptamine receptor antagonist and an inhibitor of SARS-CoV replication. It has a role as an EC 3.4.22.69 (SARS coronavirus main proteinase) inhibitor, an antiviral agent and an anticoronaviral agent. It is a tertiary amino compound, a secondary carboxamide, a member of cinnamamides and an aryl sulfide. It is a conjugate base of a cinanserin(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN(C)CCCSC1=CC=CC=C1NC(=O)/C=C/C2=CC=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_899',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an amino acid zwitterion arising from transfer of two protons from the carboxy to the amino groups of L-selenocystathionine; major species at pH 7.3. It is a tautomer of a L-selenocystathionine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C[Se]C[C@@H](C(=O)[O-])[NH3+])[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27299',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a ceramide 1-phosphate that is N-tetracosanoyl derivative of sphingosine. It derives from a sphingosine and a tetracosanoic acid. 
It is a conjugate acid of a N-tetracosanoylsphingosine 1-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)(O)O)[C@@H](/C=C/CCCCCCCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19039',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a (3R)-3-hydroxybutanoic acid oligomer resulting from the formal repeated intermolecular condensation of the hydroxy and carboxy groups of (3R)-3-hydroxybutanoic acid to give a chain in which five units of the monomer are linked together by four ester bonds. It derives from a (3R)-3-{[(3R)-3-{[(3R)-3-{[(3R)-3-hydroxybutanoyl]oxy}butanoyl]oxy}butanoyl]oxy}butanoic acid. It is a conjugate acid of a (3R)-3-{[(3R)-3-{[(3R)-3-{[(3R)-3-{[(3R)-3-hydroxybutanoyl]oxy}butanoyl]oxy}butanoyl]oxy}butanoyl]oxy}butanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CC(=O)O[C@H](C)CC(=O)O[C@H](C)CC(=O)O[C@H](C)CC(=O)O[C@H](C)CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5237',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a cyclic ketone consisting of cyclohex-3-en-1-one substituted at position 5 by a carboxy group and position 6 by an amino group. It is a beta-amino acid and a cyclic ketone. 
It is a tautomer of a 6-ammonio-5-oxocyclohex-2-ene-1-carboxylate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C=CC(C(C1=O)N)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6914',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a sphingoid that is C16 sphinganine bearing an additional 4R-hydroxy substituent. It derives from a hexadecasphing-4-enine. It is a conjugate base of a C16 phytosphingosine(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCC[C@H]([C@H]([C@H](CO)N)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7107',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a labdane diterpenoid that is isolated from the fruits of Vitex trifolia L and Vitex negundo. It has a role as a plant metabolite, an antineoplastic agent and an apoptosis inducer. It is a labdane diterpenoid, an acetate ester, a tertiary alcohol, a carbobicyclic compound and an olefinic compound.\\nThe corresponding SMILES representation is:\\nC[C@@H]1[C@H]([C@H](C2=C([C@]1(C)CCC(C)(C=C)O)CCCC2(C)C)OC(=O)C)OC(=O)C\\nThe natural language question is: The molecule is a disaccharide derivative that is 6-O-beta-D-xylopyranosyl-beta-D-glucopyranose having an (R)-mandelonitrile group at the anomeric position. It is a glycoside and a disaccharide derivative. It derives from a (R)-mandelonitrile.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O[C@@H](C#N)C3=CC=CC=C3)O)O)O)O)O)O\\nThe natural language question is: The molecule is a trioxilin having (all-cis 5,8,14) double bond configuration; and 10-, (11S)- and (12R)-hydroxy substituents. 
It derives from an all-cis-icosa-5,8,14-trienoic acid. It is a conjugate acid of a trioxilin B3(1-).\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C[C@H]([C@@H](C(/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O)O)O\\nThe natural language question is: The molecule is a long-chain fatty acid ethyl ester resulting from the formal condensation of the carboxy group of arachidic (icosanoic) acid with the hydroxy group of ethanol. It derives from an icosanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCC(=O)OCC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a branched tetrasaccharide derivative consisting of an alpha-D-mannosyl residue glycosidically linked to a 5-aminopentyl group and which carries at O-2 an alpha-D-mannosyl-(1->2)-alpha-D-mannosyl disaccharide unit and at O-4 a beta-D-galactosyl residue. It is a tetrasaccharide derivative and a glycoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCN)CCO[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15585',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a disaccharide consisting of beta-D-galactopyranose and D-glucitol joined by a 1->4 glycosidic bond. It is used as a laxative, as an excipient, and as replacement bulk sweetener in some low-calorie foods. 
It has a role as a laxative, an excipient and a cathartic.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O[C@H]([C@@H](CO)O)[C@@H]([C@H](CO)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10111',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion that is the conjugate base of carbamoyl adenylate, obtained by deprotonation of the phosphate group. It is a conjugate base of a carbamoyl adenylate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OC(=O)N)O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3093',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 3-hydroxy fatty acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (3R,23Z,26Z,29Z,32Z,35Z)-3-hydroxyoctatriacontapentaenoyl-CoA; major species at pH 7.3. It is a (R)-3-hydroxyacyl-CoA(4-), a 3-hydroxy fatty acyl-CoA(4-) and an 11,12-saturated fatty acyl-CoA(4-). It is a conjugate base of a (3R,23Z,26Z,29Z,32Z,35Z)-3-hydroxyoctatriacontapentaenoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCCC[C@H](CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23110',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 6-oxo monocarboxylic acid anion that is the conjugate base of 2-hydroxy-6-(2-hydroxyphenyl)-6-oxo-cis,cis-hexa-2,4-dienoic acid. 
It derives from a sorbate. It is a conjugate base of a 2-hydroxy-6-(2-hydroxyphenyl)-6-oxo-cis,cis-hexa-2,4-dienoic acid. It is a conjugate acid of a 2-hydroxy-6-(2-oxidophenyl)-6-oxo-cis,cis-hexa-2,4-dienoate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)C(=O)/C=C\\\\\\\\C=C(/C(=O)O)\\\\\\\\O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26628',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a diterpenoid isolated from the seed kernels of Caesalpinia crista that has been found to exhibit antimalarial activity. It has a role as a metabolite and an antimalarial. It is an acetate ester, a cyclic ether, an enone, a tertiary alcohol, a diterpenoid and an aromatic ketone.\\nThe corresponding SMILES representation is:\\nCC(=O)O[C@@H]1[C@@H]([C@@]2([C@H]3CC4=C(C=CO4)C(=O)[C@@H]3CC[C@]2(C([C@@H]1OC(=O)C)(C)C)O)C)OC(=O)C\\nThe natural language question is: The molecule is a purine nucleoside in which guanine is attached to arabinofuranose via a beta-N(9)-glycosidic bond. It inhibits DNA synthesis and causes cell death. It has a role as an antineoplastic agent and a DNA synthesis inhibitor. It is a beta-D-arabinoside and a purine nucleoside.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(N1[C@H]3[C@H]([C@@H]([C@H](O3)CO)O)O)N=C(NC2=O)N\\nThe natural language question is: The molecule is the anhydrous form of the calcium salt of fenprofen. The dihydrate form is used as a non-steroidal anti-inflammatory drug for the management of mild to moderate pain and for the relief of pain and inflammation associated with disorders such as arthritis. It has a role as a cyclooxygenase 2 inhibitor and a cyclooxygenase 1 inhibitor. 
It contains a fenoprofen(1-).\\nThe corresponding SMILES representation is:\\nCC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)[O-].CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)[O-].[Ca+2]\\nThe natural language question is: The molecule is a trans-3-enoyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (3E)-octenoyl-CoA; major species at pH 7.3. It is a trans-3-enoyl-CoA(4-) and a monounsaturated fatty acyl-CoA(4-). It is a conjugate base of a (3E)-octenoyl-CoA.\\nThe corresponding SMILES representation is:\\nCCCC/C=C/CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a pentol consisting of 1-(hydroxyimino)hexane with five hydroxy substituents placed at positions 2, 3, 4, 5 and 6. It is an aliphatic aldoxime, a pentol and an aldohexose derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(C(C(C(/C=N/O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26251',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of azabicycloalkanes that is 1-azabicyclo[3.2.0]heptan-7-one substituted at positions 3 and 6 by (2-aminoethyl)thio and ethyl groups respectively. 
It is a beta-lactam, an azabicycloalkane, an aliphatic sulfide and a primary amino compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1C2CC(CN2C1=O)SCCN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12060',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a 2-deoxyribose bisphosphate that is 2-deoxy-alpha-D-ribofuranose 3,5-bisphosphate in which the phosphate group at position 3 is esterfied by a 2,3-dideoxy-alpha-D-ribofuranos-5-yl group. It has a role as a Mycoplasma genitalium metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[C@H](O[C@@H]1COP(=O)(O)O[C@H]2C[C@H](O[C@@H]2COP(=O)(O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6041',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a pyridine alkaloid that is nicotine lacking the methyl group on the pyrrolidine nitrogen. It has a role as a metabolite and a nicotinic acetylcholine receptor agonist. It is a pyridine alkaloid and a pyrrolidine alkaloid. It derives from a hydride of a nicotine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C[C@H](NC1)C2=CN=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12813',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of dihydroflavonols that is (2S)-flavanone substituted by hydroxy groups at positions 3, 7 and 4'. It has a role as an antimutagen and a metabolite. 
It is a trihydroxyflavanone, a member of dihydroflavonols, a secondary alpha-hydroxy ketone and a member of 4'-hydroxyflavanones.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=CC=C1[C@@H]2[C@H](C(=O)C3=C(O2)C=C(C=C3)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24291',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a doubly-charged peptide anion arising from deprotonation of the four carboxy groups and protonation of the two amino groups of glutathione disulfide; major species at pH 7.3. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of a glutathione disulfide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)N[C@@H](CSSC[C@@H](C(=O)NCC(=O)[O-])NC(=O)CC[C@@H](C(=O)[O-])[NH3+])C(=O)NCC(=O)[O-])[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9802',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 1-(Z)-alk-1-enyl-2-acyl-sn-glycero-3-phosphocholine in which the alk-1-enyl and acyl groups are specified as (1Z)-hexadecenyl and (4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl respectively. It is a phosphatidylcholine P-38:6 and a 1-(Z)-alk-1-enyl-2-acyl-sn-glycero-3-phosphocholine.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC/C=C\\\\\\\\OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC\\nThe natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH group of 17-(4-hydroxyphenyl)heptadecanoyl-AMP; the major species at pH 7.3. 
It is a conjugate base of a 17-(4-hydroxyphenyl)heptadecanoyl-AMP.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1CCCCCCCCCCCCCCCCC(=O)OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)O)O\\nThe natural language question is: The molecule is dianion of stipitatic acid arising from deprotonation of the carboxylic acid and 6-hydroxy groups. It is a 5-oxo monocarboxylic acid anion and a hydroxy monocarboxylic acid anion. It is a conjugate base of a stipitatic acid and a stipitatate(1-).\\nThe corresponding SMILES representation is:\\nC1=C(C=C(C(=CC1=O)[O-])[O-])C(=O)O\\nThe natural language question is: The molecule is a N-acetyl-L-amino acid that is the N-acetyl derivative of L-threonine. It is a L-threonine derivative and a N-acetyl-L-amino acid. It is a conjugate acid of a N-acetyl-L-threoninate.\\nThe corresponding SMILES representation is:\\nC[C@H]([C@@H](C(=O)O)NC(=O)C)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydroxy fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 3-methylnonanoic acid. It is a medium-chain fatty acyl-CoA and a methyl-branched fatty acyl-CoA. It is a conjugate acid of a 3-methylnonanoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCC(C)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4653',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an epoxy fatty acid consisting of (4Z,7Z,10Z,12E,14E,19Z)-docosahexaenoic acid having an epoxy group located at the 16,17-position. An intermediate lipid in specialized proresolving mediators. It has a role as a human xenobiotic metabolite. 
It is an epoxy fatty acid, a long-chain fatty acid, a polyunsaturated fatty acid and a docosanoid. It is a conjugate acid of a (16S,17S)-epoxy-(4Z,7Z,10Z,12E,14E,19Z)-docosahexaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C[C@H]1[C@@H](O1)/C=C/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24444',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a phenolate anion that is the conjugate base of catechol. It has a role as a plant metabolite. It is a conjugate base of a catechol. It is a conjugate acid of a catecholate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7536',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is the anion obtained by removal of a proton from the carboxylic acid group of cerivastatin. It is a conjugate base of a cerivastatin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C1=C(C(=C(C(=N1)C(C)C)COC)C2=CC=C(C=C2)F)/C=C/[C@H](C[C@H](CC(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4858',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a tripeptide consisting of AcTyrGlyGly with a (4-trimethylammoniophenyl)diazenyl group at the 3-position on the tyrosine phenyl ring. 
It is a tripeptide and a monoazo compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H](CC1=CC(=C(C=C1)O)N=NC2=CC=C(C=C2)[N+](C)(C)C)C(=O)NCC(=O)NCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10895',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is the D isomer of N-acetylglucosamine. It has a role as a bacterial metabolite. It is a N-acetylglucosamine and a N-acetyl-D-hexosamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](OC1O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_577',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a hydrochloride obtained by combining alectinib with one molar equivalent of hydrochloric acid. Used for the treatment of patients with anaplastic lymphoma kinase-positive, metastatic non-small cell lung cancer. It has a role as an antineoplastic agent and an EC 2.7.10.1 (receptor protein-tyrosine kinase) inhibitor. It contains an alectinib(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=CC2=C(C=C1N3CCC(CC3)N4CCOCC4)C(C5=C(C2=O)C6=C(N5)C=C(C=C6)C#N)(C)C.Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6269',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a purine arabinonucleoside monophosphate having 2-fluoroadenine as the nucleobase. A prodrug, it is rapidly dephosphorylated to 2-fluoro-ara-A and then phosphorylated intracellularly by deoxycytidine kinase to the active triphosphate, 2-fluoro-ara-ATP. Once incorporated into DNA, 2-fluoro-ara-ATP functions as a DNA chain terminator. 
It is used for the treatment of adult patients with B-cell chronic lymphocytic leukemia (CLL) who have not responded to, or whose disease has progressed during, treatment with at least one standard alkylating-agent containing regimenas. It has a role as an antimetabolite, an antineoplastic agent, an immunosuppressive agent, an antiviral agent, a prodrug and a DNA synthesis inhibitor. It is an organofluorine compound, a nucleoside analogue and a purine arabinonucleoside monophosphate. It derives from a 2-fluoroadenine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC2=C(N=C(N=C2N1[C@H]3[C@H]([C@@H]([C@H](O3)COP(=O)(O)O)O)O)F)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8743',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an optically active form of 15-HETE having 15(R)-configuration. It is a conjugate acid of a 15(R)-HETE(1-). It is an enantiomer of a 15(S)-HETE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23717',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxylamine that is N-hydroxyquinolin-4-one in which the hydrogen at position 2 has been replaced by a (1E)-non-1-en-1-yl group. It is the most active agent produced by Pseudomonas aeruginosa that modulates the growth and virulence of Staphylococcus aureus; the corresponding Z isomer is inactive. It has a role as a bacterial metabolite and an antibacterial agent. 
It is a member of hydroxylamines, a quinolone, an organic heterobicyclic compound and an olefinic compound.\\nThe corresponding SMILES representation is:\\nCCCCCCC/C=C/C1=CC(=O)C2=CC=CC=C2N1O\\nThe natural language question is: The molecule is a monocarboxylic acid anion resulting from the removal of the proton from the carboxy group of 5-hydroxyimidazole-4-acetic acid. It is a conjugate base of a 5-hydroxyimidazole-4-acetic acid.\\nThe corresponding SMILES representation is:\\nC1=NC(=C(N1)CC(=O)[O-])O\\nThe natural language question is: The molecule is a trans-3-enoyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (2E,5Z)-octadienoyl-CoA; major species at pH 7.3. It is a 2,3-trans-enoyl CoA(4-), a polyunsaturated fatty acyl-CoA(4-) and a 4-saturated trans-2-enoyl-CoA(4-). It is a conjugate base of a (2E,5Z)-octadienoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nThe natural language question is: The molecule is an N-acylphosphatidylethanolamine in which the N-acyl group is specified as butyryl while the phosphatidyl acyl groups at position 1 and 2 are specified as palmitoyl (hexadecanoyl) and linoleoyl (9Z,12Z-octadecadienoyl) respectively. It derives from a hexadecanoic acid, a linoleic acid and a butyric acid. It is a conjugate acid of a N-butyryl-1-palmitoyl-2-linoleoyl-sn-glycero-3-phosphoethanolamine(1-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)(O)OCCNC(=O)CCC)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of 5-hydroxypentanoic acid. It derives from a valerate. 
It is a conjugate base of a 5-hydroxypentanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCO)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2718',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a cembrane diterpenoid with cytotoxic activity isolated from the soft coral Lobophytum michaelae. It has a role as an antineoplastic agent and a coral metabolite. It is a gamma-lactone, an acetate ester, a cembrane diterpenoid, an epoxide, a macrocycle and a cyclic terpene ketone.\\nThe corresponding SMILES representation is:\\nC/C/1=C\\\\\\\\CC(=O)/C(=C/C[C@H]([C@]2([C@@H](O2)[C@@H]3[C@@H]([C@@H](C1)OC(=O)C)C(=C)C(=O)O3)C)OC(=O)C)/C\\nThe natural language question is: The molecule is a tricarboxylic acid trianion obtained by deprotonation of the three carboxy groups of (-)-threo-isodihomocitric acid; major species at pH 7.3. It is a conjugate base of a (-)-threo-isodihomocitric acid.\\nThe corresponding SMILES representation is:\\nC(C[C@@H]([C@H](C(=O)[O-])O)C(=O)[O-])CC(=O)[O-]\\nThe natural language question is: The molecule is an organosulfonic acid that is 2-oxoethanesulfonic acid substituted by a (2-ethyl-6-methylphenyl)(1-methoxypropan-2-yl)amino group at postion 2. It is an ether, an aromatic amide and an organosulfonic acid.\\nThe corresponding SMILES representation is:\\nCCC1=CC=CC(=C1N(C(C)COC)C(=O)CS(=O)(=O)O)C\\nThe natural language question is: The molecule is a triterpenoid that is an intermediate in the biosynthesis of alpha-onocerin by the fern Lycopodium clavatum. It has a role as a plant metabolite. 
It is a triterpenoid, a carbobicyclic compound, a secondary alcohol, an olefinic compound and an epoxide.\\nThe corresponding SMILES representation is:\\nC/C(=C\\\\\\\\CC[C@H]1C(=C)CC[C@@H]2[C@@]1(CC[C@@H](C2(C)C)O)C)/CC/C=C(\\\\\\\\C)/CC[C@H]3C(O3)(C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of adenosines obtained by replacement of the 5'-hydroxy group of adenosine by a methylsulfinyl group. It has a role as a Camellia sinensis metabolite and a human urinary metabolite. It is a member of adenosines and a sulfoxide.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CS(=O)C[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27664',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a cyanohydrin that is obtained by the formal addition of hydrogen cyanide to the aldehyde group of 4-hydroxybenzaldehyde. It derives from a mandelonitrile.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C(C#N)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10342',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 3-oxo-fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (15Z)-3-oxotetracosenoic acid. It is a 3-oxo-fatty acyl-CoA, a very long-chain fatty acyl-CoA and a monounsaturated fatty acyl-CoA. 
It is a conjugate acid of a (15Z)-3-oxotetracosenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCCCCCCCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6619',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 3-hydroxypentanoic acid in which the chiral centre at position 3 has R-configuration. It is a conjugate acid of a (R)-3-hydroxypentanoate. It is an enantiomer of a (S)-3-hydroxypentanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17244',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a primary aliphatic amine that consists of cyclopropane bearing a single amino substituent. 
It has a role as a mouse metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC1N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28000',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is the 2-aminoethyl glycoside of an amino decasaccharide made of two alpha-L-Rhap-(1->2)-alpha-L-Rhap-(1->3)-[alpha-D-Glcp-(1->4)]-alpha-L-Rhap-(1->3)-beta-D-GlcpNAc repeating units of the Shigella flexneri serotype 2a specific polysaccharide linked (1->2).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@@H]([C@H]([C@@H](O[C@H]2O[C@H]3[C@H]([C@@H](O[C@H]([C@@H]3O[C@@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)C)O[C@@H]5[C@H]([C@@H](O[C@@H]([C@H]5O)CO)O[C@@H]6[C@@H]([C@H]([C@@H](O[C@H]6O[C@@H]7[C@@H]([C@H]([C@@H](O[C@H]7O[C@H]8[C@H]([C@@H](O[C@H]([C@@H]8O[C@@H]9[C@@H]([C@H]([C@@H]([C@H](O9)CO)O)O)O)C)O[C@@H]1[C@H]([C@@H](O[C@@H]([C@H]1O)CO)OCCN)NC(=O)C)O)C)O)O)C)O)O)NC(=O)C)O)C)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8793',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a formamide that is the D-enantiomer of N-formylkynurenine. It is a D-alpha-amino acid and a member of formamides. 
It is a tautomer of a N-formyl-D-kynurenine zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)C(=O)C[C@H](C(=O)O)N)NC=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28567',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a glycopeptidolipid antigen from clinically prominent members of the Mycobacterium avium serocomplex It has a role as an antigen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(CC(=O)N[C@H](CC1=CC=CC=C1)C(=O)N[C@H]([C@@H](C)OC2[C@@H]([C@@H]([C@@H]([C@@H](O2)C)O)O)O[C@H]3[C@@H]([C@@H]([C@H]([C@@H](O3)C)O)O[C@H]4[C@@H]([C@H]([C@H]5[C@H](O4)CO[C@](O5)(C)C(=O)O)OC)O)O)C(=O)N[C@H](C)C(=O)N[C@@H](C)COC6[C@@H]([C@@H]([C@H]([C@@H](O6)C)OC)OC)O)O.O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11043',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organic heterotetracyclic compound that is 7,12-dihydropyrido[3',2':2,3]azepino[4,5-b]indole substituted at positions 6 and 9 by oxo and bromo groups respectively. It has a role as an EC 2.7.11.26 (tau-protein kinase) inhibitor and a Wnt signalling activator. It is an organic heterotetracyclic compound, an organonitrogen heterocyclic compound, a lactam and an organobromine compound.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1C2=C(C3=C(C=CC=N3)NC1=O)NC4=C2C=C(C=C4)Br'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19477',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is the monocarboxylic acid amide resulting from the formal condensation of the aryl amino group of 3-methyl-N-phenyl-1-(2-phenylethyl)piperidin-4-amine with propanoic acid. 
It has a role as an opioid analgesic, a mu-opioid receptor agonist and a sedative. It is a member of piperidines and a monocarboxylic acid amide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(=O)N(C1CCN(CC1C)CCC2=CC=CC=C2)C3=CC=CC=C3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9407',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is pentaanion of 2,3-didehydropimeloyl-CoA arising from deprotonation of phosphate, diphosphate and carboxylic acid functions. It is a conjugate base of a 2,3-didehydropimeloyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)C=CCCCC(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12847',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a non-proteinogenic L-alpha-amino acid that is L-arginine substituted by a methyl group at position 5. It derives from a L-arginine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CC[C@@H](C(=O)O)N)N=C(N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17372',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a C82 mycolic acid having a C56 meromycolic chain with two cis cyclopropyl functions and a saturated C26 alpha-branch. It is produced by Mycobacterium tuberculosis H37Ra. It has a role as a bacterial metabolite. It is a mycolic acid and a hydroxy fatty acid. 
It is a conjugate acid of a (2R)-2-[(1R)-1-hydroxy-22-{2-[10-(2-octadecylcyclopropyl)decyl]cyclopropyl}docosyl]hexacosanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCC[C@H]([C@@H](CCCCCCCCCCCCCCCCCCCCCC1CC1CCCCCCCCCCC2CC2CCCCCCCCCCCCCCCCCC)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14699',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a 2-oxo monocarboxylic acid that is 2-oxopentanoic acid in which C-3 is methyl-substituted and C-5 is substituted by a carbamimidamido group. It contains a guanidino group. It derives from a valeric acid. It is a tautomer of a 5-guanidino-3-methyl-2-oxopentanoic acid zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCN=C(N)N)C(=O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12852',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 1,2-diacyl-sn-glycerol 3-phosphate in which the acyl substituents at positions 1 and 2 are specified as oleoyl and (6Z)-octadecenoyl respectively. It derives from an oleic acid and a petroselinic acid. It is a conjugate acid of a 1-oleoyl-2-(6Z)-octadecenoyl-sn-glycero-3-phosphate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCC/C=C\\\\\\\\CCCCC(=O)O[C@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)COP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15868',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a phosphatidylcholine O-38:3 in which the alkyl and acyl groups specified at positions 1 and 2 are octadecyl and (8Z,11Z,14Z)-eicosatrienoyl respectively. It is a phosphatidylcholine O-38:3 and a 2-acyl-1-alkyl-sn-glycero-3-phosphocholine. 
It derives from an all-cis-icosa-8,11,14-trienoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCOC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2993',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an epoxy fatty acid that is (6E,8E,10Z,13Z,15E,19Z)-docosa-6,8,10,13,15,19-hexaenoic acid which is carrying an epoxy group at position 4S and a hydroxy group at position 17R. It is a metabolite of docosahexaenoic acid that can be converted to a D-resolvin. It has a role as a metabolite. It is a hydroxydocosahexaenoic acid and an epoxy fatty acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C[C@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\C1[C@@H](O1)CCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29492',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a phosphatidylcholine 34:3 in which the acyl groups specified at positions 1 and 2 are palmitoyl and (9Z,12Z,15Z)-octadecatrienoyl (alpha-linolenoyl) respectively. It has a role as a mouse metabolite. It derives from an alpha-linolenic acid and a hexadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26706',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an aldehyde resulting from the formal oxidation of methanol. 
It has a role as a carcinogenic agent, an allergen, an EC 3.5.1.4 (amidase) inhibitor, a disinfectant, an environmental contaminant, a Saccharomyces cerevisiae metabolite, an Escherichia coli metabolite and a mouse metabolite. It is a one-carbon compound and an aldehyde.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3511',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is chemical element (nickel group element atom) with atomic number 46. It is a nickel group element atom, a platinum group metal atom and a metal allergen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[Pd]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29104',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an N-alkylpiperazine that is butane which is substituted by a 3,5-dioxopiperazin-1-yl group at positions 2 and 3. The meso isomer. It has a role as an EC 5.99.1.3 [DNA topoisomerase (ATP-hydrolysing)] inhibitor, an apoptosis inducer and an antineoplastic agent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@H](C)N1CC(=O)NC(=O)C1)N2CC(=O)NC(=O)C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23433',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a tirucallane triterpenoid that is (13alpha,14beta,17alpha,20S)-lanosta-2,7-dien-1-one substituted by an oxo group at position 1. It has been isolated from the stem and stem barks of Cornus walteri. It has a role as a plant metabolite. 
It is a tirucallane triterpenoid, a cyclic terpene ketone and an enone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](CCCC(C)C)[C@@H]1CC[C@]2([C@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(C(=O)C=CC4(C)C)C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20424',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a 2-oxo monocarboxylic acid and an epsilon-amino acid. It has a role as a human metabolite. It derives from a hexanoic acid. It is a tautomer of a 6-amino-2-oxohexanoic acid zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCN)CC(=O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2266',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a medium-chain fatty acid that is decanoic acid substituted at position 3 by a hydroxy group. It has a role as an antimitotic and an Escherichia coli metabolite. It is a 3-hydroxy fatty acid and a medium-chain fatty acid. It derives from a decanoic acid. It is a conjugate acid of a 3-hydroxydecanoate.\\nThe corresponding SMILES representation is:\\nCCCCCCCC(CC(=O)O)O\\nThe natural language question is: The molecule is a high-mannose oligosaccharide that is beta-D-mannopyranose in which the hydrogens of hydroxy groups are replaced by an alpha-D-mannopyranosyl-group at position 6, an alpha-D-mannopyranosyl-(1right2)-alpha-D-mannopyranosyl-(1right2)-alpha-D-mannopyranosyl group at position 3, and a chitobiose group at position 1. 
It is an amino sugar, an amino heptasaccharide, an acetamide and a high-mannose oligosaccharide.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@@H]2[C@H](OC([C@@H]([C@H]2O)NC(=O)C)O)CO)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O)O[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O[C@@H]7[C@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O)O)O\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (2E,18Z,21Z,24Z,27Z,30Z,33Z)-hexatriacontaheptaenoic acid. It is an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. It is a conjugate acid of a (2E,18Z,21Z,24Z,27Z,30Z,33Z)-hexatriacontaheptaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is an ADP-L-glycero-D-manno-heptose having beta-configuration at the anomeric centre of the heptose. It is a conjugate acid of an ADP-L-glycero-beta-D-manno-heptose(2-).\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)OP(=O)(O)O[C@H]4[C@H]([C@H]([C@@H]([C@H](O4)[C@H](CO)O)O)O)O)O)O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is an amino tetrasaccharide comprising alpha-D-galactose, beta-D-galactose, N-acetyl-beta-D-glucosamine and (at the reducing end) alpha-D-mannose residues linked sequentially (1->3), (1->3) and (1->2). 
It is a glucosamine oligosaccharide and an amino tetrasaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@H]2[C@H]([C@@H]([C@H](O[C@@H]2O)CO)O)O)CO)O)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O[C@@H]4[C@@H]([C@H]([C@H]([C@H](O4)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13615',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a fatty acid methyl ester resulting from the formal condensation of the carboxy group of 12(R)-HPETE with methanol. It derives from an icosa-5,9,11,14-tetraenoic acid.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\[C@@H](C/C=C\\\\\\\\CCCC(=O)OC)OO\\nThe natural language question is: The molecule is a monocarboxylic acid anion resulting from the deprotonation of the tetrazole NH group and carboxy group of candesartan. It is the major species at pH 7.3. It is a conjugate base of a candesartan.\\nThe corresponding SMILES representation is:\\nCCOC1=NC2=CC=CC(=C2N1CC3=CC=C(C=C3)C4=CC=CC=C4C5=NN=N[N-]5)C(=O)[O-]\\nThe natural language question is: The molecule is a lipid hydroperoxide, obtained by the formal substitution of a hydrogen at position 10 of (8E,12Z,15Z)-octadeca-10,12,15-trienoic acid by a hydroperoxy group (the 10S stereoisomer). It is a lipid hydroperoxide and a monocarboxylic acid. It is a conjugate acid of an (8E,10S,12Z,15Z)-10-hydroperoxyoctadec-8,12,15-trienoate.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C[C@@H](/C=C/CCCCCCC(=O)O)OO\\nThe natural language question is: The molecule is an organophosphate oxoanion arising from deprotonation of the phosphate OH groups of 2-hydroxypropyl dihydrogen phosphate; major species at pH 7.3. 
It is a conjugate base of a 2-hydroxypropyl dihydrogen phosphate.\\nThe corresponding SMILES representation is:\\nCC(COP(=O)([O-])[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a kaempferol O-glucoside that is kaempferol attached to a alpha-L-rhamnopyranosyl(1->2)-beta-D-glucopyranosyl residue at position 3 and a alpha-L-rhamnopyranosyl residue at position 7. Isolated from the aerial parts of Vicia faba and Lotus edulis, it exhibits inhibitory activity against topoisomerase I. It has a role as a metabolite, an EC 5.99.1.2 (DNA topoisomerase) inhibitor and a plant metabolite. It is an alpha-L-rhamnoside, a kaempferol O-glucoside and a dihydroxyflavone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC3=C(OC4=CC(=CC(=C4C3=O)O)O[C@H]5[C@@H]([C@@H]([C@H]([C@@H](O5)C)O)O)O)C6=CC=C(C=C6)O)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11126',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is the ketoaldonic acid phosphate formed formally from L-erythronic acid by oxidation of the 3-hydroxy group to an oxo group and phosphorylation at the 1-hydroxy group. It derives from a L-erythronic acid. It is a conjugate acid of a (S)-2-hydroxy-3-oxo-4-(phosphonatooxy)butanoate(3-). It is an enantiomer of a (R)-2-hydroxy-3-oxo-4-(phosphonooxy)butanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(=O)[C@@H](C(=O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4773',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an O-acylcarnitine having stearoyl (octadecanoyl) as the acyl substituent. It has a role as a human metabolite. 
It derives from an octadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCC(=O)OC(CC(=O)[O-])C[N+](C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14791',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is tetraanion of vinylacetyl-CoA arising from deprotonation of phosphate and diphosphate functions. It is a conjugate base of a vinylacetyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CC=C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9019',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a furanochrome in which the basic tricyclic skeleton is substituted at positions 4 and 9 with methoxy groups and at position 7 with a methyl group. A major constituent of the plant Ammi visnaga it is a herbal folk medicine used for various illnesses, its main effect being as a vasodilator. It has a role as a vasodilator agent, a bronchodilator agent, an anti-asthmatic agent and a cardiovascular drug. It is an organic heterotricyclic compound, an oxacycle and a furanochromone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=O)C2=C(C3=C(C(=C2O1)OC)OC=C3)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24039',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a beta-lactam that is 7-oxo-1-azabicyclo[3.2.0]heptane-2-carboxylic acid carring additional (2-aminoethyl)sulfanyl and ethyl substituents at positions 3 and 6 respectively. An intermediate in the biosynthesis of carbapenem. 
It has a role as a bacterial metabolite. It is a beta-lactam, an aliphatic sulfide, a monocarboxylic acid, an organic heterobicyclic compound and a primary amino compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1C2CC(C(N2C1=O)C(=O)O)SCCN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_133',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a 17alpha-hydroxy-C21-steroid that is cortisol in which the 4-5 double bond has undergone formal hydrogenation to give the corresponding 5beta- steroid. It is a 21-hydroxy steroid, a 17alpha-hydroxy-C21-steroid, an 11beta-hydroxy steroid, a 3-oxo-5beta-steroid, a primary alpha-hydroxy ketone, a triol, a secondary alcohol, a diketone, a 20-oxo steroid and a tertiary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CCC(=O)C[C@H]1CC[C@@H]3[C@@H]2[C@H](C[C@]4([C@H]3CC[C@@]4(C(=O)CO)O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18570',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a dihydroxy monocarboxylic acid that consists of octadecanoic (stearic) acid bearing two hydroxy substituents at positions 3 and 18. It is an omega-hydroxy fatty acid, a 3-hydroxy fatty acid, a dihydroxy monocarboxylic acid and a hydroxyoctadecanoic acid. It is a conjugate acid of a 3,18-dihydroxyoctadecanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCCCCC(CC(=O)O)O)CCCCCCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20865',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a bioactive sphingoid, sphinganine, in which the terminal hydroxy group has been replaced by a hydrogen. 
It has a role as an antineoplastic agent. It is a sphingoid and an amino alcohol. It derives from a sphinganine. It is a conjugate base of a 1-deoxysphinganine(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCC[C@H]([C@H](C)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25996',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an L-amino acid anion, being the conjugate base of L-3-oxoalanine. It has a role as a Saccharomyces cerevisiae metabolite. It is a 3-oxo monocarboxylic acid anion and a L-alpha-amino acid anion. It is a conjugate base of a L-3-oxoalanine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(=O)[C@@H](C(=O)[O-])N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24573',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is the anion resulting from the removal of a proton from the carboxylic acid group of 5-dehydro-4-deoxy-D-glucuronic acid. It is a conjugate base of a 5-dehydro-4-deoxy-D-glucuronic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]([C@H](C=O)O)O)C(=O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1480',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is the methyl ester of leukotriene E4, the esterified acid group being the one forming position 1 of the icosatetraenyl chain. Leukotriene E4 methyl ester is a more lipid-soluble form of leukotriene E4. It is a leukotriene and a methyl ester. 
It derives from a leukotriene E4.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\[C@H]([C@H](CCCC(=O)OC)O)SC[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1382',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an optically active cyclic ketone consisting of 3,5,6-trihydroxycyclohexa-2,4-dien-1-one bearing two 3-methylbut-2-en-1-yl substituents at positions 4 and 6 as well as a 3-methylbutanoyl group at the 2-position. It has a role as an antibacterial drug, an antioxidant, a cyclooxygenase 2 inhibitor and a metabolite. It is a diketone, a triol, a cyclic ketone, an aromatic ketone and a tertiary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)CC(=O)C1=C(C(=C([C@@](C1=O)(CC=C(C)C)O)O)CC=C(C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14637',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is the amide resulting from the formal condensation of 4-[(biphenyl-2-ylcarbonyl)amino]benzoic acid with the benzazepine nitrogen of 2-methyl-1,4,5,6-tetrahydroimidazo[4,5-d][1]benzazepine. It is an antagonist for two of the three types of arginine vasopressin (AVP) receptors, V1a and V2. It is used as its hydrochloride salt for the treatment of hyponatraemia (low blood sodium levels) caused by syndrome of inappropriate antidiuretic hormone (SIADH). 
It has a role as a vasopressin receptor antagonist.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=NC2=C(N1)CCN(C3=CC=CC=C32)C(=O)C4=CC=C(C=C4)NC(=O)C5=CC=CC=C5C6=CC=CC=C6'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7632',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a 3-oxo monocarboxylic acid that is dihydroferulic acid in which the benzylic methylene group has been oxidised to give the corresponding ketone. It is a 3-oxo monocarboxylic acid, a member of phenols, an aromatic ether and an aromatic ketone. It derives from a dihydroferulic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)C(=O)CC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10937',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a hydroxamic acid that is N-hydroxy-D-valinamide in which the alpha-amino group has been substituted by isopropoxy and [biphenyl]-4-ylsulfonyl groups. A selective matrix metalloproteinase-2 (MMP-2) inhibitor, it is one of the most potent inducers of autophagy. Its physiological roles include angiogenesis, cancer metastasis, embryogenesis, tissue remodeling in development, and wound healing. It has a role as an EC 3.4.24.24 (gelatinase A) inhibitor, an autophagy inducer, an antineoplastic agent and a melanin synthesis inhibitor. 
It is a hydroxamic acid and a D-valine derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)[C@H](C(=O)NO)N(OC(C)C)S(=O)(=O)C1=CC=C(C=C1)C2=CC=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5560',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a 3-oxoacyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 4-isopropenyl-2-oxocyclohexane-1-carboxylic acid. It has a role as a mouse metabolite. It is an acyl-CoA and a 3-oxo-fatty acyl-CoA. It derives from a cyclohexane-1-carbonyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=C)C1CCC(C(=O)C1)C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13539',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a 17beta-hydroxy steroid that is testosterone in which the 17beta hydrogen is replaced by an ethynyl group. Ethisterone was the first orally active progestin and is a metabolite of danazol. It has a role as a progestin and a drug metabolite. It is a 17beta-hydroxy steroid, a 3-oxo-Delta(4) steroid, a terminal acetylenic compound and a tertiary alcohol. It derives from a testosterone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2CC[C@]4([C@H]3CC[C@]4(C#C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18779',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an L-alpha-amino acid anion having methylselanylmethyl as the side-chain. 
It is a conjugate base of a Se-methyl-L-selenocysteine. It is an enantiomer of a Se-methyl-D-selenocysteinate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[Se]C[C@@H](C(=O)[O-])N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26006',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an organic salt obtained by combining guanidine with one molar equivalent of acetic acid. It contains a guanidinium and an acetate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)O.C(=N)(N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5667',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a dimethoxyflavone that is the 7,4'-dimethyl ether derivative of apigenin. It has a role as a plant metabolite. It is a dimethoxyflavone and a monohydroxyflavone. It derives from an apigenin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC=C(C=C1)C2=CC(=O)C3=C(C=C(C=C3O2)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18433',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an N-acyl-L-alpha-amino acid anion resulting from deprotonation of both carboxy groups and protonation of the amide nitrogen of N-acetyl-L-glutamic acid. It derives from a L-glutamate(1-). It is a conjugate base of a N-acetyl-L-glutamic acid. 
It is a conjugate acid of a N-acetyl-L-glutamate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)[NH2+][C@@H](CCC(=O)[O-])C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18769',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an isotopically modified compound, an organic sodium salt and a sodium 2-iodohippurate. It has a role as a radiopharmaceutical. It derives from a N-benzoylglycine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)C(=O)NCC(=O)[O-])[131I].[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28032',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an amino acid zwitterion. It is a conjugate base of a cysteinium. It is a conjugate acid of a cysteinate(1-). It is a tautomer of a cysteine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(C(=O)[O-])[NH3+])S'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8765',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a binaphthopyran resulting from the oxidative coupling at position 8 of two molecules of monapinone E. It is a binaphthopyran and a naphtho-alpha-pyrone. 
It derives from a monapinone E.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC2=CC3=C(C(=C2C(=C1C4=C(C5=C(C6=C(C[C@H](OC6=O)C[C@@H](C[C@@H](CCCCCO)O)O)C=C5C=C4OC)O)O)O)O)C(=O)O[C@@H](C3)C[C@@H](C[C@@H](CCCCCO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24211',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a steroid phosphate that is the 21-O-phospho derivative of cortisol. It is a cortisol ester, a steroid phosphate, an 11beta-hydroxy steroid, a 3-oxo-Delta(4) steroid, a 17alpha-hydroxy steroid and a tertiary alpha-hydroxy ketone. It is a conjugate acid of a cortisol phosphate(2-).\\nThe corresponding SMILES representation is:\\nC[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2[C@H](C[C@]4([C@H]3CC[C@@]4(C(=O)COP(=O)(O)O)O)C)O\\nThe natural language question is: The molecule is a tetrapeptide composed of two L-alanine units linked to two L-aspartic acid units by peptide linkages. It has a role as a metabolite. It derives from a L-alanine and a L-aspartic acid.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(=O)O)C(=O)O)N\\nThe natural language question is: The molecule is an acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of (15Z)-3-oxotetracosenoyl-CoA. It is a 3-oxo-fatty acyl-CoA(4-) and an 11,12-saturated fatty acyl-CoA(4-). 
It is a conjugate base of a (15Z)-3-oxotetracosenoyl-CoA.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nThe natural language question is: The molecule is a tripeptide resulting from the formal condensation of the carboxy group of N(5)-[amino(sulfoamino)phosphoryl]-L-ornithine with the N-terminal amino group of L-alanyl-L-arginine. A toxin produced by cultured Pseudomonas syringae, the causal agent of halo blight disease in bean plants. It has a role as a bacterial metabolite, an antineoplastic agent and an EC 2.1.3.3 (ornithine carbamoyltransferase) inhibitor. It is a tripeptide and a member of guanidines. It derives from a sulfamic acid.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)O)NC(=O)[C@H](CCCNP(=O)(N)NS(=O)(=O)O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is a monoterpene that is bicyclo[3.1.1]heptane substituted by methyl groups at positions 2, 6 and 6. It has a role as a plant metabolite. It is a terpenoid fundamental parent, a monoterpene and a carbobicyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1CCC2CC1C2(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6942',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a UDP-amino sugar having N-acetyl-2-amino-2-deoxy-D-glucuronic acid as the sugar component. It derives from a D-glucuronic acid. 
It is a conjugate acid of an UDP-N-acetyl-2-amino-2-deoxy-D-glucuronate and an UDP-2-acetamido-2-deoxy-D-glucuronate(3-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](OC1OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27780',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a dipeptide formed from L-lysine and L-tyrosine residues. It has a role as a metabolite. It derives from a L-lysine and a L-tyrosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C[C@@H](C(=O)O)NC(=O)[C@H](CCCCN)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15854',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a dicarboxylic acid dianion obtained by deprotonation of both carboxy groups of 4-amino-4-deoxychorismic acid. It has a role as a Saccharomyces cerevisiae metabolite. It derives from a chorismate(2-). It is a conjugate base of a 4-amino-4-deoxychorismate(1-).\\nThe corresponding SMILES representation is:\\nC=C(C(=O)[O-])O[C@@H]1C=C(C=C[C@H]1N)C(=O)[O-]\\nThe natural language question is: The molecule is a member of the class of xanthones that is 9H-xanthen-9-one substituted by hydroxy groups at positions 1, 5 and 6. It has a role as a plant metabolite. 
It is a member of xanthones and a polyphenol.\\nThe corresponding SMILES representation is:\\nC1=CC(=C2C(=C1)OC3=C(C2=O)C=CC(=C3O)O)O\\nThe natural language question is: The molecule is a salinosporamide in which the core (1R)-6-oxa-2-azabicyclo[3.2.0]heptane-3,7-dione skeleton is substituted at positions 1, 4, and 5 by (1S)-cyclohex-2-en-1-yl(hydroxy)methyl, 2-chloroethyl, and methyl groups, respectively (the 1R,4R,5S diastereoisomer). A potent proteasome inhibitor, it has attracted interest for potential use in the treatment of various cancers. It has a role as an antineoplastic agent and a proteasome inhibitor. It is a salinosporamide, an organochlorine compound, an organic heterobicyclic compound, a beta-lactone and a gamma-lactam.\\nThe corresponding SMILES representation is:\\nC[C@]12[C@H](C(=O)N[C@]1(C(=O)O2)[C@H]([C@H]3CCCC=C3)O)CCCl\\nThe natural language question is: The molecule is a member of the class of phenols that is indan which has been hydroxylated at position 5. It is a member of phenols and a member of indanes.\\nThe corresponding SMILES representation is:\\nC1CC2=C(C1)C=C(C=C2)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a primary alcohol that is butan-1-ol substituted by a chloro group at position 4. It has a role as a metabolite. It is a primary alcohol and an organochlorine compound. It derives from a butan-1-ol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCl)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13789',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organic cation that is the conjugate acid of (R,R)-asenapine, obtained by protonation of the tertiary amino group. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a (R,R)-asenapine. 
It is an enantiomer of a (S,S)-asenapine(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[NH+]1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=CC=C24'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5618',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a member of mugineic acids. It derives from a mugineic acid. It is a conjugate acid of a 3-epi-3-hydroxy-2'-deoxymugineate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]([C@H](N1CC[C@@H](C(=O)O)NCC[C@@H](C(=O)O)O)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14319',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of ureas in which three of the four hydrogens are replaced by 4-fluorobenzyl, 1-methylpiperidin-4-yl, and 4-(isopropyloxy)benzyl groups. An atypical antipsychotic that is used (in the form of its tartrate salt) for treatment of hallucinations and delusions associated with Parkinson's disease. It has a role as an antipsychotic agent, a 5-hydroxytryptamine 2A receptor inverse agonist and a serotonergic antagonist. It is a member of ureas, a member of piperidines, a member of monofluorobenzenes, an aromatic ether and a tertiary amino compound. It is a conjugate base of a pimavanserin(1+).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C3CCN(CC3)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27316',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a benzofuran consisting of fused benzene and furan rings. 
It is the parent compound of the class of 1-benzofurans. It is a benzofuran and a member of 1-benzofurans.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C=CO2\\nThe natural language question is: The molecule is a pyrazole that is 1H-pyrazole bearing an ethyl group at position 3, a 2-hydroxyphenyl group at position 2, and a 4-methoxyphenyl group at position 5. It has a role as a metabolite. It is a member of phenols, a member of pyrazoles and a monomethoxybenzene.\\nThe corresponding SMILES representation is:\\nCCC1=C(C(=NN1)C2=CC=C(C=C2)OC)C3=CC=CC=C3O\\nThe natural language question is: The molecule is the current microcin B17, CHEBI:64624 is a precursor, see fig. 2 It has a role as a DNA synthesis inhibitor.\\nThe corresponding SMILES representation is:\\nCC[C@H](C)[C@@H](C(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCC1=NC(=CO1)C2=NC(=CS2)C(=O)NCC(=O)NCC(=O)N[C@@H](CCC(=O)N)C(=O)NCC(=O)NCC(=O)NCC3=NC(=CS3)C(=O)NCC(=O)NCC4=NC(=CS4)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(=O)N)C(=O)NCC5=NC(=CS5)C6=NC(=CO6)C(=O)NCC(=O)NCC(=O)N[C@@H](CC(=O)N)C(=O)NCC(=O)NCC7=NC(=CO7)C(=O)NCC(=O)NCC8=NC(=CO8)C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CC9=CNC=N9)C(=O)N[C@@H]([C@@H](C)CC)C(=O)O)NC(=O)CNC(=O)[C@H](C(C)C)N\\nThe natural language question is: The molecule is a peptide zwitterion obtained by transfer of a proton from the carboxy to the amino group of bacilysin; major species at pH 7.3. It is a tautomer of a bacilysin.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@@H](C[C@@H]1CCC(=O)[C@H]2[C@@H]1O2)C(=O)[O-])[NH3+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is an N-alkylpiperazine in which the two amino hydrogens of piperazine have been replaced by diphenylmethyl and 3-(2-phenyl-1,3-dioxolan-2-yl)propyl groups. A calcium channel blocker and serotonin (5-HT2) receptor antagonist used in the treatment of migraine. 
It has a role as a calcium channel blocker, a serotonergic antagonist and a vasodilator agent. It is a N-alkylpiperazine, a dioxolane and a cyclic ketal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CN(CCN1CCCC2(OCCO2)C3=CC=CC=C3)C(C4=CC=CC=C4)C5=CC=CC=C5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27454',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a flavonolignan isoalted from Mimosa diplotricha. It has a role as a plant metabolite. It is a flavonolignan, a dimethoxybenzene, a polyphenol and a benzodioxine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=CC(=C1O)OC)[C@@H]2[C@H](OC3=C(O2)C=C(C=C3)C4=CC(=O)C5=C(C=C(C=C5O4)O)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_728',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a metallophthalocyanine consisting of a phthalocyanine-disulfonic acid bound to a central copper atom. Used in the form of an arylguanidinium salt as a histological dye. It has a role as a fluorochrome and a histological dye. It is a member of metallophthalocyanines, an arenesulfonic acid and a copper coordination entity. It is a conjugate acid of a Luxol fast blue MBS(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=C(C(=CC=C8)S(=O)(=O)O)C(=N7)N=C2[N-]3)C9=C4C=CC=C9S(=O)(=O)O.[Cu]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15547',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a 6-lactoyl-5,6,7,8-tetrahydropterin in which the stereocentre at position 6 has R-configuration. 
It is a 6-lactoyl-5,6,7,8-tetrahydropterin and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C(=O)[C@H]1CNC2=C(N1)C(=O)NC(=N2)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9189',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of the class of resolvins that is (6E,8E,10Z,13Z,15E,19Z)-docosahexaenoic acid carrying three hydroxy substituents at positions 4, 5 and 17 (the 4S,5R,17S-stereoisomer). It has a role as an anti-inflammatory agent and a human xenobiotic metabolite. It is a resolvin, a secondary allylic alcohol, a triol and a hydroxy polyunsaturated fatty acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C[C@@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\[C@H]([C@H](CCC(=O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9439',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a triphenylacetate salt obtained by combining vilanterol with one equivalent of triphenylacetic acid. Used in combination with fluticasone furoate for treatment of bronchospasm associated with chronic obstructive pulmonary disease. It has a role as a beta-adrenergic agonist and a bronchodilator agent. It contains a vilanterol(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)C(=O)O.C1=CC(=C(C(=C1)Cl)COCCOCCCCCCNC[C@@H](C2=CC(=C(C=C2)O)CO)O)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16624',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a C19-gibberellin that is a pentacyclic diterpenoid responsible for promoting growth and elongation of cells in plants. 
Initially identified in Gibberella fujikuroi it differs from gibberellin A1 in the absence of OH groups at C-2 and C-7 (gibbane numberings). It has a role as a plant metabolite and a mouse metabolite. It is a lactone, a C19-gibberellin and a gibberellin monocarboxylic acid. It is a conjugate acid of a gibberellin A9(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CCC[C@@]3([C@@H]1[C@@H]([C@]45[C@H]3CC[C@H](C4)C(=C)C5)C(=O)O)OC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29497',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of pyranopyrroles with formula C18H23NO4, originally isolated from Aspergillus niger. It has a role as an Aspergillus metabolite and a marine metabolite. It is a gamma-lactam, an enol, a pyranopyrrole and a cyclic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C/C=C/C1=C(C(=O)C2=C(O1)C(N(C2=O)C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28197',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hydroxy fatty acid ascaroside anion resulting from the deprotonation of the carboxy group of oscr#3. The conjugate base of oscr#3 and the major species at pH 7.3. 
It is a conjugate base of an oscr#3.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCC/C=C/C(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21603',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an alkylglucosinolic acid that consists of 1-thio-beta-D-glucopyranose attached to an -N-(sulfooxy)propanimidoyl group at the anomeric sulfur. It occurs in Lepidium sativum (garden cress) and Armoracia lapathifolia (horseraddish). A flavour component; the hydrolysis product, ethyl isothiocyanate, is very pungent and garlic-like.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C(=N/OS(=O)(=O)O)/S[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2625',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an amino trisaccharide that is D-galactopyranose in which the hydroxy groups at positions 3 and 4 have each been glycosylated by a 2-acetamido-2-deoxy-beta-D-glucopyranosyl group. It is an amino trisaccharide and a member of acetamides. 
It derives from a beta-D-GlcpNAc-(1->3)-D-Galp and a beta-D-GlcpNAc-(1->4)-D-Galp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@H]2[C@H](OC([C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)NC(=O)C)O)O)CO)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8803',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an azamacrocyle in which four nitrogen atoms at positions 1, 4, 8 and 11 of a fouteen-membered ring are each substituted with a carboxymethyl group. It has a role as a chelator. It derives from a hydride of a 1,4,8,11-tetraazacyclotetradecane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CN(CCN(CCCN(CCN(C1)CC(=O)O)CC(=O)O)CC(=O)O)CC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1425',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH groups of Ni(II)-pyridinium-3,5-bisthiocarboxylic acid mononucleotide; major species at pH 7.3. It has a role as a cofactor. It is a conjugate base of a Ni(II)-pyridinium-3,5-bisthiocarboxylic acid mononucleotide.\\nThe corresponding SMILES representation is:\\nC1=C([C-]=C(C=[N+]1[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])[O-])O)O)C(=O)[S-])C(=[SH+])O.[Ni]\\nThe natural language question is: The molecule is an aminotrisaccharide consisting of beta-D-galactopyranose, 2-acetamido-2-deoxy-beta-D-glucopyranose and L-furopyranose residues joined in sequence by (1->4) and (1->3) glycosidic bonds. It is an amino trisaccharide and a member of acetamides. 
It derives from a N-acetyllactosamine.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H](C(O1)O)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)NC(=O)C)O\\nThe natural language question is: The molecule is a phenylpropanoid that is the acetate ester of trans-isoeugenol. It is a phenylpropanoid, a monomethoxybenzene and a member of phenyl acetates. It derives from a trans-isoeugenol.\\nThe corresponding SMILES representation is:\\nC/C=C/C1=CC(=C(C=C1)OC(=O)C)OC\\nThe natural language question is: The molecule is a glycosyl alditol consisting of beta-D-galactofuranose and D-altritol residues joined in sequence by a (1->4) glycosidic bond. It derives from a beta-D-galactofuranose and a D-altritol.\\nThe corresponding SMILES representation is:\\nC([C@H]([C@H]1[C@@H]([C@H]([C@@H](O1)O[C@H]([C@@H](CO)O)[C@H]([C@@H](CO)O)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 1,3-oxazole compound having a phenyl substituent at the 2-position, an ethoxymethylene group at the 4-position, and an oxo group at the 5-position. It is a chemical allergen used for immunological experiments, particularly for experiments on delayed type hypersensitivity. It has a role as an allergen. 
It is a member of 1,3-oxazoles and a gamma-lactone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCO/C=C/1\\\\\\\\C(=O)OC(=N1)C2=CC=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9757',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an oligosaccharide derivative consisting of -D-GalNAc-ol at the reducing end with a D-GalNAc-(1->3)-D-GalNAc-(1->3)-[D-GalNAc-(1->2)-L-Fuc-(1->4)]-D-GlcNAc moiety attached via a (1->3)-linkage and a L-Fuc residue attached via a (1->6)-linkage.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H](C(O1)OC[C@H]([C@@H]([C@@H]([C@H](CO)NC(=O)C)OC2[C@@H]([C@H]([C@@H]([C@H](O2)CO)OC3[C@H]([C@@H]([C@@H]([C@@H](O3)C)O)O)OC4[C@@H]([C@H]([C@H]([C@H](O4)CO)O)O)NC(=O)C)OC5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)OC6[C@@H]([C@H]([C@H]([C@H](O6)CO)O)O)NC(=O)C)NC(=O)C)NC(=O)C)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10033',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a trimethoxyflavone that is the 3',4',5'-tri-O-methyl ether of tricetin. It is a trimethoxyflavone, a dihydroxyflavone and a 3',5'-dimethoxyflavone. It derives from a tricetin. It is a conjugate acid of a 3',4',5'-O-trimethyltricetin(1-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC(=CC(=C1OC)OC)C2=CC(=O)C3=C(C=C(C=C3O2)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21776',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a non-proteinogenic L-alpha-amino acid that is L-homoserine in which the hydroxy group at position 4 is substituted by an aminooxy group. It has been isolated from legumes and plays an essential role in lugume chemical defense against insects. 
It has a role as a plant metabolite, an antineoplastic agent, an antimetabolite and a phytogenic insecticide. It derives from a L-homoserine. It is a tautomer of a L-canaline zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CON)[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2499',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a glycosyl alditol derivative consisting of N-acetyllactosamine and N-acetyl-D-galactosaminitol joined in sequence by a (1->6) glycosidic bond. It is a glycosyl alditol derivative, a glycoside and a member of acetamides. It derives from a N-acetyl-D-galactosaminitol and a N-acetyllactosamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC[C@H]([C@@H]([C@@H]([C@H](CO)NC(=O)C)O)O)O)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20348',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a member of the class of coumestans that is coumestrol with a prenyl substituent at position 10. It has a role as a plant metabolite. It is a member of coumestans, an olefinic compound, an organic heterotetracyclic compound, a delta-lactone and a polyphenol. It derives from a coumestrol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC1=C(C=CC2=C1OC3=C2C(=O)OC4=C3C=CC(=C4)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16088',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a BODIPY dye having a phenyl substituent at the 5-position and a (2-carboxyethyl) substituent at the 3-position. It has a role as a fluorochrome. 
It is a BODIPY dye and a monocarboxylic acid. It derives from a 4,4-difluoro-4-bora-3a,4a-diaza-s-indacene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[B-]1(N2C(=CC=C2CCC(=O)O)C=C3[N+]1=C(C=C3)C4=CC=CC=C4)(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23473',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pyridazinium ion that is pyridazin-1-ium which is substituted by a phenyl, amino and methoxy groups at positions 1, 4 and 6, respectively. It has a role as an antihypotensive agent, an EC 1.4.3.4 (monoamine oxidase) inhibitor, a sympathomimetic agent and an adrenergic uptake inhibitor. It is an aromatic ether, a primary arylamine and a pyridazinium ion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=[N+](N=CC(=C1)N)C2=CC=CC=C2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18262',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a rhodamine 6G(1+), an organic chloride salt and a xanthene dye. It has a role as a fluorochrome.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCNC1=CC2=C(C=C1C)C(=C3C=C(C(=[NH+]CC)C=C3O2)C)C4=CC=CC=C4C(=O)OCC.[Cl-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1557',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a polyunsaturated fatty acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (9Z,12Z)-18-hydroxyoctadecadienoyl-CoA; major species at pH 7.3. It is a long-chain fatty acyl-CoA(4-), an omega-hydroxy fatty acyl-CoA(4-) and a polyunsaturated fatty acyl-CoA(4-). 
It is a conjugate base of a (9Z,12Z)-18-hydroxyoctadecadienoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14735',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a glycosylglucose consisting of beta-L-fucopyranose and alpha-D-glucopyranose residues joined in sequence by a (1->4) glycosidic bond. It derives from a beta-L-fucose and an alpha-D-glucose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@H](O1)O[C@@H]2[C@H](O[C@@H]([C@@H]([C@H]2O)O)O)CO)O)O)O\\nThe natural language question is: The molecule is a D-methionine derivative that is the amide obtained by formal condensation of the carboxy group of D-methionine with the amino group of 2-naphthylamine. It is a 2-amino-4-(methylsulfanyl)-N-(2-naphthyl)butanamide and a D-methionine derivative. It is an enantiomer of a L-methionine 2-naphthylamide.\\nThe corresponding SMILES representation is:\\nCSCC[C@H](C(=O)NC1=CC2=CC=CC=C2C=C1)N\\nThe natural language question is: The molecule is a synthetic 25-membered heterodetic cyclic peptide consisting of a 14-membered linear component attached to an 11-membered cyclic portion. A CK2 inhibitor with potential antineoplastic activity. It has a role as an apoptosis inducer, an antineoplastic agent, an EC 2.7.11.1 (non-specific serine/threonine protein kinase) inhibitor and an angiogenesis modulating agent. 
It is a heterodetic cyclic peptide and a polypeptide.\\nThe corresponding SMILES representation is:\\nC[C@H]([C@H]1C(=O)N[C@@H](CSSC[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N2CCC[C@H]2C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)NCC(=O)N1)CC(C)C)CC3=CNC=N3)CCCNC(=N)N)CO)CCSC)CC4=CNC5=CC=CC=C54)NC(=O)CCNC(=O)[C@H](CCC(=O)N)NC(=O)[C@@H]6CCCN6C(=O)[C@@H]7CCCN7C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCNC(=N)N)NC(=O)CN)C(=O)O)O\\nThe natural language question is: The molecule is an anthocyanin cation that is the 3-O-beta-D-glucoside of peonidin (methylcyanidin). It has a role as an antioxidant and a plant metabolite. It is an anthocyanin cation, a beta-D-glucoside and a monosaccharide derivative. It derives from a peonidin. It is a conjugate acid of a peonidin 3-O-beta-D-glucoside betaine.\\nThe corresponding SMILES representation is:\\nCOC1=C(C=CC(=C1)C2=[O+]C3=CC(=CC(=C3C=C2O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an apo carotenoid triterpenoid that is tetracosane containing double bonds at the 2-3, 6-7, 8-9, 10-11, 12-13, 14-15, 16-17, 18-19, and 22-23 positions, and substituted by methyl groups at positions 2, 6, 10, 15, 19, and 23. 
It is an apo carotenoid triterpenoid, a triterpene and a polyene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/C=C/C(=C/C=C/C=C(/C=C/C=C(/CCC=C(C)C)\\\\\\\\C)\\\\\\\\C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16290',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a polyunsaturated fatty acid anion that is the conjugate base of 12(S)-HPE(5,8,10)TrE, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a polyunsaturated fatty acid anion, a long-chain fatty acid anion, a hydroperoxy fatty acid anion, a hydroperoxyicosatrienoate and a hydroperoxy polyunsaturated fatty acid anion. It derives from a (5Z,8Z,11Z)-icosatrienoate. It is a conjugate base of a 12(S)-HPE(5,8,10)TrE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC[C@@H](/C=C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)[O-])OO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9367',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a sphingomyelin 38:1 obtained by formal condensation of the carboxy group of henicosanoic acid with the amino group of 14-methylhexadecasphingosine-1-phosphocholine. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. 
It derives from a 14-methylhexadecasphingosine and a henicosanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCC(C)CC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13999',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a 3-[alpha-D-galactosyl-(1->6)-beta-D-galactosyl]-1,2-diacyl-sn-glycerol in which the 1- and 2-acyl groups are both specified as alpha-linolenoyl. It has a role as a Brassica napus metabolite. It derives from an alpha-linolenic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](CO[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO[C@@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O)O)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13724',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organic cation obtained by protonation of the four free amino groups of amikacin; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of an amikacin.\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CC[NH3+])O)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)[NH3+])O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C[NH3+])O)O)O)[NH3+]\\nThe natural language question is: The molecule is a purine ribonucleoside 5'-monophosphate that is AMP substituted at position N-6 by a (2E)-4-hydroxy-3-methylbut-2-en-1-yl group. It has a role as a plant metabolite. It is an adenosine 5'-phosphate, a purine ribonucleoside 5'-monophosphate and a N-glycosylzeatin. 
It derives from an adenosine 5'-monophosphate. It is a conjugate acid of a 9-ribosyl-trans-zeatin 5'-phosphate(2-).\\nThe corresponding SMILES representation is:\\nC/C(=C\\\\\\\\CNC1=C2C(=NC=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)O)O)O)/CO\\nThe natural language question is: The molecule is an alpha-amino-acid anion. It is a conjugate base of a selenocysteine. It is a conjugate acid of a selenocysteinate(2-).\\nThe corresponding SMILES representation is:\\nC(C(C(=O)[O-])N)[Se]\\nThe natural language question is: The molecule is a disaccharide consisting of L-threo-hex-4-enopyranuronose and 2-(sulfoamino)-D-glucopyranose residues joined in sequence by a (1->4) glycosidic bond. It derives from a 4,5-dehydro-D-glucuronic acid.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)NS(=O)(=O)O)O)OC2[C@@H]([C@H](C(=C(O2)C(=O)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organic cation obtained by protonation of the secondary amino function of N-(2,3-dihydro-1,4-benzodioxin-2-ylmethyl)-2-(2,6-dimethoxyphenoxy)ethanamine. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a N-(2,3-dihydro-1,4-benzodioxin-2-ylmethyl)-2-(2,6-dimethoxyphenoxy)ethanamine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=C(C(=CC=C1)OC)OCC[NH2+]CC2COC3=CC=CC=C3O2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27845',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a dicarboxylic acid dianion obtained via deprotonation of both carboxy groups of 2-oxo-2H-pyran-4,6-dicarboxylic acid; major species at pH 7.3. 
It is a conjugate base of a 2-oxo-2H-pyran-4,6-dicarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(C=C(OC1=O)C(=O)[O-])C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10409',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a dicarboxylic acid monoester resulting from the formal condensation of one of the carboxylic acid groups of crocetin with the anomeric hydroxy group of beta-D-gentiobiose. It is a dicarboxylic acid monoester, a glycoside and a disaccharide derivative. It derives from a crocetin and a gentiobiose. It is a conjugate acid of a beta-D-gentiobiosyl crocetin(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=C\\\\\\\\C=C\\\\\\\\C=C(/C)\\\\\\\\C=C\\\\\\\\C=C(/C)\\\\\\\\C(=O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)O)O)/C=C/C=C(\\\\\\\\C)/C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2662',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the iminium cation of malachite green isothiocyanate. It has a role as a fluorochrome. It is a tertiary amine and an iminium ion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN(C)C1=CC=C(C=C1)C(=C2C=CC(=[N+](C)C)C=C2)C3=CC=C(C=C3)N=C=S'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13458',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an oxo carboxylic acid anion that is the conjugate base of 5-(3'-carboxy-3'-oxopropyl)-4,6-dihydroxypicolinic acid. 
It is a conjugate base of a 5-(3'-carboxy-3'-oxopropyl)-4,6-dihydroxypicolinic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=C(NC(=O)C(=C1[O-])CCC(=O)C(=O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20247',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a carboxamide resulting from the formal condensation of the primary amino group of 4-propanoyl-1,3-thiazol-2-amine with the carboxy group of a 3-phenylbutiyric acid which has been substituted at position 2 by a (4R)-4-[p-(2-hydroxyethoxy)phenyl]-2,5-dioxoimidazolidin-1-yl group. It is a potent, highly selective inhibitormitogen-activated protein kinase 1/2 inhibitor. It has a role as an EC 2.7.11.24 (mitogen-activated protein kinase) inhibitor. It is a member of 1,3-thiazoles, an imidazolidine-2,4-dione, an aromatic ketone and a secondary carboxamide.\\nThe corresponding SMILES representation is:\\nCCC(=O)C1=CSC(=N1)NC(=O)[C@H]([C@@H](C)C2=CC=CC=C2)N3C(=O)[C@H](NC3=O)C4=CC=C(C=C4)OCCO\\nThe natural language question is: The molecule is trianionic form of pyrroloquinoline quinol arising from deprotonation of the three carboxy groups. It is a tricarboxylic acid trianion and a pyrroloquinoline. It is a conjugate base of a pyrroloquinoline quinol. It is a conjugate acid of a pyrroloquinoline quinol(4-).\\nThe corresponding SMILES representation is:\\nC1=C(NC2=C(C(=C3C=C(N=C3C2=C1C(=O)[O-])C(=O)O)[O-])[O-])C(=O)O\\nThe natural language question is: The molecule is an optically active phosphonic acid having a 1-aminoethyl group attached to the phosphorus. It is a member of phosphonic acids and a primary amino compound. It derives from a phosphonic acid. 
It is an enantiomer of a (S)-(1-aminoethyl)phosphonic acid.\\nThe corresponding SMILES representation is:\\nC[C@H](N)P(=O)(O)O\\nThe natural language question is: The molecule is a member of the class of benzimidazoles that is the methyl ester of [1-(butylcarbamoyl)-1H-benzimidazol-2-yl]carbamic acid. A foliar fungicide used to control a wide range of Ascomycetes and Fungi Imperfecti in a wide range of crops. It has a role as an anthelminthic drug, a tubulin modulator, a microtubule-destabilising agent, an acaricide and an antifungal agrochemical. It is a member of benzimidazoles, a carbamate ester, an aromatic amide, a benzimidazole fungicide and a benzimidazolylcarbamate fungicide.\\nThe corresponding SMILES representation is:\\nCCCCNC(=O)N1C2=CC=CC=C2N=C1NC(=O)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a meroterpenoid resulting from the formal condensation of the hydroxy group of fumagillol with the carboxylic acid group of (all-E)-deca-2,4,6,8-tetraenedioic acid. Originally isolated from the fungus Aspergillus fumigatus, it is used for the control of Nosema infection in honey bees. It has a role as an angiogenesis inhibitor, an antibacterial drug, an antiprotozoal drug, a methionine aminopeptidase 2 inhibitor, an antimicrobial agent and a fungal metabolite. It is an organooxygen heterocyclic antibiotic, a meroterpenoid, a carboxylic ester, a dicarboxylic acid monoester, an antibiotic antifungal drug and a spiro-epoxide. 
It derives from a fumagillol and an (all-E)-deca-2,4,6,8-tetraenedioic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CC[C@@H]1[C@@](O1)(C)[C@H]2[C@@H]([C@@H](CC[C@]23CO3)OC(=O)/C=C/C=C/C=C/C=C/C(=O)O)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29346',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a primary nitroalkane that is hexane substituted by a nitro group at position 1. It has a role as a human urinary metabolite. It derives from a hydride of a hexane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26373',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a glycosyl alditol derivative consisting of D-galactopyranose, 2-acetamido-2-deoxy-D-glucopyranose and 2-acetamido-2-deoxy-D-galactitol residues joined in sequence by (1->4) and (1->3) glycosidic bonds. It is a glycosyl alditol derivative, a partially-defined glycan and a member of acetamides. It derives from a N-acetyl-D-galactosaminitol, a D-Galp-(1->4)-D-GlcpNAc and a 3-O-(N-acetyl-D-glucosaminyl)-N-acetyl-D-galactosaminitol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](OC1O[C@H]([C@H](CO)NC(=O)C)[C@H]([C@@H](CO)O)O)CO)OC2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11628',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a primary alpha-hydroxy ketone that is butane-1,2-diol in which the hydroxy group at position 2 has been formally oxidised to give the corresponding ketone. 
It derives from a butan-2-one and a butane-1,2-diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(=O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16348',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a CMP-sugar having 8-amino-3,8-dideoxy-alpha-D-manno-oct-2-ulosonic acid as the sugar component. It has a role as a bacterial metabolite. It derives from an 8-amino-3,8-dideoxy-alpha-D-manno-oct-2-ulosonic acid. It is a conjugate acid of a CMP-8-amino-3,8-dideoxy-alpha-D-manno-oct-2-ulosonate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@H]([C@H](O[C@]1(C(=O)O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=NC3=O)N)O)O)[C@@H](CN)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27807',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an N-acyl-1-O-beta-D-glucosyl-15-methylhexadecasphing-4-enine in which the acyl group has 23 carbons and 0 double bonds and is 2-hydroxylated. It derives from a 15-methylhexadecasphing-4-enine.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCC(C(=O)N[C@@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)[C@@H](/C=C/CCCCCCCCCC(C)C)O)O\\nThe natural language question is: The molecule is an organic sulfide that is the S-adenosyl derivative of L-homocysteine. It has a role as a cofactor, an EC 2.1.1.79 (cyclopropane-fatty-acyl-phospholipid synthase) inhibitor, an EC 2.1.1.72 [site-specific DNA-methyltransferase (adenine-specific)] inhibitor, a fundamental metabolite and an epitope. It is a member of adenosines, an organic sulfide, a homocysteine derivative and a member of homocysteines. 
It is a conjugate acid of a S-adenosyl-L-homocysteinate. It is a tautomer of a S-adenosyl-L-homocysteine zwitterion.\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)CSCC[C@@H](C(=O)O)N)O)O)N\\nThe natural language question is: The molecule is an alpha-amino acid that is valeric acid which is substituted at position 2 by an amino group. It derives from a valeric acid.\\nThe corresponding SMILES representation is:\\nCCCC(C(=O)O)N\\nThe natural language question is: The molecule is a nucleotide-sugar oxoanion obtained by deprotonation of the diphosphate OH groups of dTDP-L-rhamnose; major species at pH 7.3. It is a conjugate base of a dTDP-L-rhamnose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H](C(O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a non-proteinogenic L-alpha-amino acid that is L-norleucine which is substituted at position 5 by an oxo group and at position 6 by a diazo group. It is as inhibitor of various glutamine-utilising enzymes. It has a role as a bacterial metabolite, an analgesic, an antibacterial agent, an antiviral agent, an antineoplastic agent, an EC 6.3.5.5 [carbamoyl-phosphate synthase (glutamine-hydrolysing)] inhibitor, an EC 6.3.4.2 [CTP synthase (glutamine hydrolyzing)] inhibitor, an EC 6.3.5.3 (phosphoribosylformylglycinamidine synthase) inhibitor, an EC 6.3.5.2 [GMP synthase (glutamine-hydrolysing)] inhibitor, an antimetabolite, a glutamine antagonist, an apoptosis inducer, an EC 2.4.2.14 (amidophosphoribosyltransferase) inhibitor, an EC 3.5.1.2 (glutaminase) inhibitor, an EC 6.3.5.1 [NAD(+) synthase (glutamine-hydrolysing)] inhibitor and an EC 6.3.5.4 [asparagine synthase (glutamine-hydrolysing)] inhibitor. 
It is a non-proteinogenic L-alpha-amino acid, a diazo compound and a ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)C=[N+]=[N-])[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7353',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a polycyclic aromatic hydrocarbon consisting of fused benzene rings in a rectilinear arrangement. It is an ortho-fused polycyclic arene and a member of acenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C=C3C=CC=CC3=CC2=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14255',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a tetrahydroxyflavone that is flavone substituted by hydroxy groups at positions 5, 7, 2' and 4', a prenyl group at position 3 and a 3-methylbut-1-en-3-yl group at position 8. It has been isolated from the twigs of Morus nigra and has been found to promote adipogenesis. It has a role as a metabolite and a plant metabolite.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=CCC1=C(OC2=C(C1=O)C(=CC(=C2C(C)(C)C=C)O)O)C3=C(C=C(C=C3)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10798',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of 4-carboxymethyl-3-methylbut-2-en-1,4-olide; major species at pH 7.3. 
It is a conjugate base of a 4-carboxymethyl-3-methylbut-2-en-1,4-olide.\\nThe corresponding SMILES representation is:\\nCC1=CC(=O)OC1CC(=O)[O-]\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of (S)-dihydrocamalexic acid, obtained by deprotonation of the carboxy group. It is a conjugate base of a (S)-dihydrocamalexic acid. It is an enantiomer of a (R)-dihydrocamalexate.\\nThe corresponding SMILES representation is:\\nC1[C@@H](N=C(S1)C2=CNC3=CC=CC=C32)C(=O)[O-]\\nThe natural language question is: The molecule is a pregnane-based steroidal hormone produced by the outer-section (zona glomerulosa) of the adrenal cortex in the adrenal gland, and acts on the distal tubules and collecting ducts of the kidney to cause the conservation of sodium, secretion of potassium, increased water retention, and increased blood pressure. The overall effect of aldosterone is to increase reabsorption of ions and water in the kidney. It has a role as a human metabolite and a mouse metabolite. It is an 11beta-hydroxy steroid, a 21-hydroxy steroid, a 18-oxo steroid, a 20-oxo steroid, a C21-steroid hormone, a steroid aldehyde, a 3-oxo-Delta(4) steroid, a primary alpha-hydroxy ketone and a mineralocorticoid. It derives from a hydride of a pregnane.\\nThe corresponding SMILES representation is:\\nC[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2[C@H](C[C@]4([C@H]3CC[C@@H]4C(=O)CO)C=O)O\\nThe natural language question is: The molecule is a prostaglandin carboxylic acid anion that is the conjugate base of prostaglandin J2, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a prostaglandin J2.\\nThe corresponding SMILES representation is:\\nCCCCC[C@@H](/C=C/[C@@H]1[C@H](C=CC1=O)C/C=C\\\\\\\\CCCC(=O)[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a pyrroline that is 1-pyrroline in which the hydrogen at position 2 is replaced by an acetyl group. 
It is an aroma and flavour compound present in jasmine rice and basmati rice. It is responsible for the 'popcorn' aroma in a large variety of cereal and food products. It is one of the key odourants of the crust of bread and considered to be responsible for the cracker-like odour properties. In bread, it is primarily generated during baking but amounts are influenced by ingredient composition and fermentation conditions. It has a role as a metabolite, a flavouring agent and a Maillard reaction product. It is a methyl ketone, a pyrroline and an acylimine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=O)C1=NCCC1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12868',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is dianion of xylitol 5-phosphate arising from deprotonation of the phosphate OH groups; major species at pH 7.3. It derives from a xylitol. It is a conjugate base of a xylitol 5-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]([C@H]([C@@H](COP(=O)([O-])[O-])O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28581',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is dianion of aldehydo-D-glucose 6-phosphate arising from deprotonation of the two OH groups of the phosphate. It is a conjugate base of an aldehydo-D-glucose 6-phosphate.\\nThe corresponding SMILES representation is:\\nC([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)OP(=O)([O-])[O-]\\nThe natural language question is: The molecule is a 1,2-diacyl-sn-glycerol that has myristoyl and oleoyl as 1- and 2-acyl groups respectively. It has a role as a mouse metabolite. It is a 1,2-diacyl-sn-glycerol, a 1-myristoyl-2-oleoylglycerol and a tetradecanoate ester. 
It is an enantiomer of a 2-oleoyl-3-myristoyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC(=O)OC[C@H](CO)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC\\nThe natural language question is: The molecule is an organic chloride salt comprising of a tetrabutylammonium cation and chloride anion. It is a tetrabutylammonium salt and an organic chloride salt.\\nThe corresponding SMILES representation is:\\nCCCC[N+](CCCC)(CCCC)CCCC.[Cl-]\\nThe natural language question is: The molecule is a carbohydrate-functionalised sequence-defined oligo(amidoamine) in which an Asn-Leu-Phe-Gln-Val-Val-His-Asn-Ser-Tyr-Asn-Arg-Pro-Ala-Tyr-Ser-Pro-Gly amino acid sequence is linked via its terminal glycine residue to the terminal amino group of a (2-{[2-(4-amino-4-oxobutanamido)ethyl]amino}ethyl)amino chain, to the -NH- group proximal to the amino-acid-linked amino group of which is also linked an alpha-L-rhamnosyl-(1->3)-beta-D-glucosyloxy disaccharide unit via a 5-(ethylsulfinyl)pentanoyl chain.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@H]2[C@@H]([C@H](O[C@H]([C@@H]2O)OCCS(=O)CCCCC(=O)N(CCNC(=O)CCC(=O)N)CCNC(=O)CNC(=O)[C@@H]3CCCN3C(=O)[C@H](CO)NC(=O)[C@H](CC4=CC=C(C=C4)O)NC(=O)[C@H](C)NC(=O)[C@@H]5CCCN5C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CC6=CC=C(C=C6)O)NC(=O)[C@H](CO)NC(=O)[C@H](CC(=O)N)NC(=O)[C@H](CC7=CNC=N7)NC(=O)[C@H](C(C)C)NC(=O)[C@H](C(C)C)NC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CC8=CC=CC=C8)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(=O)N)N)CO)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 2,2'-iminobis[1-(6-fluoro-3,4-dihydro-2H-chromen-2-yl)ethanol] that has (1R,1'R,2R,2'S)-configuration. It is a conjugate base of a (S,R,R,R)-nebivolol(1+). 
It is an enantiomer of a (R,S,S,S)-nebivolol.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1CC2=C(C=CC(=C2)F)O[C@H]1[C@@H](CNC[C@H]([C@@H]3CCC4=C(O3)C=CC(=C4)F)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27781',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of benzyl alcohols that is ethanol substituted by a phenyl group at position 1 which in turn is substituted by methoxy groups at positions 3 and 5 and a hydroxy group at position 4 respectively. It is a member of phenols, a dimethoxybenzene and a member of benzyl alcohols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C1=CC(=C(C(=C1)OC)O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5017',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a dihydroceramide obtained by formal condensation of the carboxy group of pentacosanoic acid with the amino group of 14-methylhexadecasphinganine. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It derives from a pentacosanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@@H](CCCCCCCCCCC(C)CC)O\\nThe natural language question is: The molecule is trianion of tri-trans,poly-cis-undecaprenyl diphosphate arising from deprotonation of the diphosphate OH groups; major species at pH 7.3. 
It is a conjugate base of a tri-trans,poly-cis-undecaprenyl diphosphate.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])OP(=O)([O-])[O-])/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C\\nThe natural language question is: The molecule is an ether that is dimethyl ether in which one of the hydrogens attached to each of the methyl group has replaced by a chlorine. It has a role as a carcinogenic agent and an alkylating agent. It is an ether and an organochlorine compound.\\nThe corresponding SMILES representation is:\\nC(OCCl)Cl\\nThe natural language question is: The molecule is the simplest member of the class of phenylethanolamines that is 2-aminoethanol bearing a phenyl substituent at the 1-position. The parent of the phenylethanolamine class. It has a role as a human metabolite. It is a conjugate base of a phenylethanolaminium.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)C(CN)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 1-acyl-sn-glycero-3-phospho-(1'-sn-glycerol)(1-) in which the acyl group is specified as hexadecanoyl (palmitoyl); major species at pH 7.3. It is a conjugate base of a 1-hexadecanoyl-sn-glycero-3-phospho-(1'-sn-glycerol).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OC[C@H](CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2873',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a nucleoside pentaphosphate that is adenosine in which a hexahydrogen pentaphosphate is attached at the 5' position. It is an adenosine 5'-phosphate and a nucleoside pentaphosphate. 
It is a conjugate acid of an adenosine 5'-pentaphosphate(6-).\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N\\nThe natural language question is: The molecule is a phenolate anion that is the conjugate base of xanthogalenol, obtained by deprotonation of the 1-hydroxy group. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a conjugate base of a xanthogalenol.\\nThe corresponding SMILES representation is:\\nCC(=CCC1=C(C=C(C(=C1[O-])C(=O)/C=C/C2=CC=C(C=C2)O)O)OC)C\\nThe natural language question is: The molecule is an organic sodium salt composed of iron(3+), sodium and 5-(oxidoimino)-6-oxo-5,6-dihydronaphthalene-2-sulfonate ions in a 1:1:3 ratio. Used to stain collagen and animal tissue. It has a role as a histological dye. It is an organic sodium salt and an iron coordination entity. It contains a 5-(oxidoimino)-6-oxo-5,6-dihydronaphthalene-2-sulfonate.\\nThe corresponding SMILES representation is:\\nC1=CC2=C(C=CC(=C2N=O)[O-])C=C1S(=O)(=O)[O-].C1=CC2=C(C=CC(=C2N=O)[O-])C=C1S(=O)(=O)[O-].C1=CC2=C(C=CC(=C2N=O)[O-])C=C1S(=O)(=O)[O-].[Na+].[Na+].[Na+].[Fe+3]\\nThe natural language question is: The molecule is an acyl-CoA(4-) that is the tetraanion of 3-hydroxybenzoyl-CoA arising from deprotonation of phosphate and diphosphate functions. It is a conjugate base of a 3-hydroxybenzoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)C4=CC(=CC=C4)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a prostaglandins A. It has a role as a human metabolite. 
It is a conjugate acid of a prostaglandin A2(1-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCC[C@@H](/C=C/[C@H]1C=CC(=O)[C@@H]1C/C=C\\\\\\\\CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5569',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a methanesulfonate salt obtained from pergolide by mixing eqimolar amount of pergolide and methanesulfonic acid. A dopamine D2 receptor agonist which also has D1 and D2 agonist properties, it is used in the management of Parkinson's disease, although it was withdrawn from the U.S. and Canadian markets in 2007 due to an increased risk of cardiac valve dysfunction. It has a role as an antiparkinson drug and a dopamine agonist. It contains a pergolide(1+).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCN1C[C@@H](C[C@H]2[C@H]1CC3=CNC4=CC=CC2=C34)CSC.CS(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10379',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a long-chain fatty acid anion that is the conjugate base of elaidic acid; shown to exert detrimental effects on mitochondrial lipid composition and function. It is a conjugate base of an elaidic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C/CCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27381',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 2-phenylcyclopropan-1-amine that is the (1S,2R)-enantiomer of tranylcypromine. It is a conjugate base of a (1S,2R)-tranylcypromine(1+). 
It is an enantiomer of a (1R,2S)-tranylcypromine.\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H]1N)C2=CC=CC=C2\\nThe natural language question is: The molecule is a glycosylgalactose consisting of alpha-L-fucopyranose and alpha-D-galactopyranose residues joined in sequence by a (1->2) glycosidic bond. It derives from an alpha-L-fucose and an alpha-D-galactose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@@H]2O)CO)O)O)O)O)O\\nThe natural language question is: The molecule is an anthraquinone that is knipholone in which the O-methyl group is replaced by a beta-D-glucopyranosyl group. It is isolated from the roots of Bulbine frutescens and exhibits trypanocidal and antiplasmodial activities. It has a role as a metabolite, an antiplasmodial drug and a trypanocidal drug. It is a beta-D-glucoside, a polyphenol, a methyl ketone, an aromatic ketone and a dihydroxyanthraquinone. It derives from a knipholone.\\nThe corresponding SMILES representation is:\\nCC1=CC(=C2C(=C1C3=C(C(=C(C=C3O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)C(=O)C)O)C(=O)C5=C(C2=O)C(=CC=C5)O)O\\nThe natural language question is: The molecule is a stilbenoid that is trans-stilbene substituted by hydroxy groups at positions 3 and 4' and methoxy groups at positions 2', 3' and 5. Isolated from Pholidota yunnanensis, it exhibits inhibitory effects on production of nitric oxide. It has a role as a metabolite and an EC 1.14.13.39 (nitric oxide synthase) inhibitor. It is a member of methoxybenzenes, a polyphenol and a stilbenoid.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1)O)/C=C/C2=C(C(=C(C=C2)O)OC)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is an enolate resulting from the deprotonation of the hydroxy group of the enol moiety of TAN-1612. 
It is a conjugate base of a TAN-1612.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=O)C1=C(C2(C(=O)C3=C(C4=C(C=C(C=C4C=C3CC2(CC1=O)O)OC)O)O)O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18241',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an ammonium betaine that is pipecolic acid zwitterion with methyl groups substituted for the two hydrogens at the nitrogen. It is found in in fruits, seeds, and leaves of orange, lemon, and bergamot. It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[N+]1(CCCCC1C(=O)[O-])C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19865',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an amino tetrasaccharide comprising an alpha-sialyl residue, two N-acetyl-beta-D-glucosaminyl residues and an N-acetyl-D-glucosamine residue linked in a (2->6), (1->4) and (1->4) sequence. It has a role as an epitope. It is an amino tetrasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H](C[C@@](O[C@H]1[C@@H]([C@@H](CO)O)O)(C(=O)O)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O)NC(=O)C)O[C@@H]4[C@H](OC([C@@H]([C@H]4O)NC(=O)C)O)CO)CO)NC(=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11999',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a sesquiterpenoid that is (1E,4E,8E)-alpha-humulene which is substituted by a hydroxy group at the carbon atom attached to two double bonds (position 8). It is a sesquiterpenoid and a secondary alcohol. 
It derives from a hydride of a humulane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C/1=C\\\\\\\\CC(/C=C/C(/C(=C/CC1)/C)O)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26290',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an acyclic mixed acid anhydride that results from the formal condensation of the phosphoryl group of AMP with the carboxyl group of 2-amino-3-hydroxy-4-methylbenzoic acid. It has a role as a bacterial metabolite. It is an acyclic mixed acid anhydride and a purine ribonucleoside 5'-monophosphate. It derives from an adenosine 5'-monophosphate and a 3-hydroxy-4-methylanthranilic acid. It is a conjugate acid of a 2-amino-3-hydroxy-4-methylbenzoyl-AMP(1-).\\nThe corresponding SMILES representation is:\\nCC1=C(C(=C(C=C1)C(=O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)O)N)O\\nThe natural language question is: The molecule is a phosphatidylcholine 40:7 in which the acyl groups at positions 1 and 2 are specified as (9Z)-octadecenoyl and (4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl respectively. It has a role as a mouse metabolite. It derives from an all-cis-docosa-4,7,10,13,16,19-hexaenoic acid and an oleic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC\\nThe natural language question is: The molecule is a D-galactose 6-sulfate that has alpha configuration at the anomeric centre. It is a D-galactose 6-sulfate and a monosaccharide sulfate. 
It derives from a D-galactose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)OS(=O)(=O)O\\nThe natural language question is: The molecule is an ethoxybenzoic acid carrying an ethoxy substituent at position 3. It derives from a benzoic acid. It is a conjugate acid of a 3-ethoxybenzoate.\\nThe corresponding SMILES representation is:\\nCCOC1=CC=CC(=C1)C(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a D-mannose 1-phosphate with an alpha-configuration at the anomeric position. It is a D-mannose 1-phosphate and an alpha-D-hexose 1-phosphate. It derives from an alpha-D-mannose. It is a conjugate acid of an alpha-D-mannose 1-phosphate(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)OP(=O)(O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6607',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a member of the class of chromenes that is 2H-1-benzopyran substituted by methyl groups at positions 2 and 2, an ethyl group at position 4 and a 4-methoxyphenyl group at position 3 respectively. It is a member of pyrrolidines, a monomethoxybenzene, a member of phenols and a member of chromenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=C(C(OC2=C1C=CC(=C2CN3CCCC3)O)(C)C)C4=CC=C(C=C4)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19641',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a carboxylic ester resulting from the formal condensation of the carboxy group of N-benzoyl-L-phenylalanine with the hydroxy group of N-benzoyl-L-phenylalaninol. 
A metabolite found in several Pencillium and Aspergillus species, as well as in plants as a product of endophytic fungi. It has a role as an antineoplastic agent. It is a L-phenylalanine derivative, a member of benzamides and a carboxylic ester. It derives from a N-benzoyl-L-phenylalanine and a N-benzoyl-L-phenylalaninol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C[C@@H](COC(=O)[C@H](CC2=CC=CC=C2)NC(=O)C3=CC=CC=C3)NC(=O)C4=CC=CC=C4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22661',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a branched oligosaccharide consisting of six D-galactose residues linked alpha(1->6), with the second and fifth residues being branched at O-2 to alpha-L-Araf-(1->5)-alpha-L-Araf and alpha-L-Araf residues respectively.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)OC[C@@H]2[C@@H]([C@@H]([C@H]([C@@H](O2)OC[C@@H]3[C@@H]([C@@H]([C@H]([C@@H](O3)OC[C@@H]4[C@@H]([C@@H]([C@H]([C@@H](O4)OC[C@@H]5[C@@H]([C@@H]([C@H]([C@@H](O5)OC[C@H]([C@@H]([C@@H]([C@H](C=O)O)O)O)O)O[C@H]6[C@@H]([C@H]([C@@H](O6)CO)O)O)O)O)O)O)O)O)O)O)O[C@H]7[C@@H]([C@H]([C@@H](O7)CO[C@H]8[C@@H]([C@H]([C@@H](O8)CO)O)O)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2783',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a carboxylic ester resulting from the formal condensation of the carboxy group of N-[(S)-{[(2R,3S,4R,5R)-5-(4-aminopyrrolo[2,1-f][1,2,4]triazin-7-yl)-5-cyano-3,4-dihydroxytetrahydrofuran-2-yl]methoxy}(phenoxy)phosphoryl]-L-alanine with the hydroxy group of 2-ethylbutan-1-ol. 
A broad-spectrum antiviral prodrug with potent in vitro antiviral activity against a diverse panel of RNA viruses such as Ebola virus, MERS-CoV and SARS-CoV. It is currently in Phase III clinical trials for the treatment of Covid-19 in adults. It has a role as an antiviral drug, a prodrug and an anticoronaviral agent. It is a carboxylic ester, a pyrrolotriazine, a nitrile, a phosphoramidate ester, a C-nucleoside and an aromatic amine. It derives from a GS-441524.\\nThe corresponding SMILES representation is:\\nCCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@@H]1[C@H]([C@H]([C@](O1)(C#N)C2=CC=C3N2N=CN=C3N)O)O)OC4=CC=CC=C4\\nThe natural language question is: The molecule is an organic sodium salt that is the tetrasodium salt of cangrelor. Used as an intravenous antiplatelet drug that prevents formation of harmful blood clots in the coronary arteries. It has a role as a platelet aggregation inhibitor and a P2Y12 receptor antagonist. It contains a cangrelor(4-).\\nThe corresponding SMILES representation is:\\nCSCCNC1=C2C(=NC(=N1)SCCC(F)(F)F)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)(C(P(=O)([O-])[O-])(Cl)Cl)[O-])O)O.[Na+].[Na+].[Na+].[Na+]\\nThe natural language question is: The molecule is a member of the class of piperidine in which the hydrogen attached to the nitrogen atom is substituted by a 2-hydroxyethyl group. It is a member of piperidines, a tertiary amino compound, a primary alcohol and a member of ethanolamines.\\nThe corresponding SMILES representation is:\\nC1CCN(CC1)CCO\\nThe natural language question is: The molecule is a UDP-D-galactose(2-) in which the anomeric centre of the galactose moiety has alpha-configuration. It is an UDP-D-galactose(2-) and an UDP-monosaccharide(2-). 
It is a conjugate base of an UDP-alpha-D-galactose.\\nThe corresponding SMILES representation is:\\nC1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])OP(=O)([O-])O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of 2-pyranones that is 2H-pyran-2-one substituted by a methoxy group at position 4 and a 3,5-dimethyl-6-phenylhexa-1,3,5-trien-1-yl group at position 6 (the 1E,3Z,5E stereoisomer). It has been isolated from an endophytic fungus Aspergillus niger. It has a role as an Aspergillus metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=C/C(=C/C1=CC=CC=C1)/C)/C=C/C2=CC(=CC(=O)O2)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12823',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an amino acid zwitterion of L-homocitrulline arising from transfer of a proton from the carboxy to the amino group; major species at pH 7.3. It has a role as a human metabolite and a mouse metabolite. It is a tautomer of a L-homocitrulline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCNC(=O)N)C[C@@H](C(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15302',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is the ethyl ester of mandelic acid. It is an ethyl ester and a secondary alcohol. 
It derives from a mandelic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCOC(=O)C(C1=CC=CC=C1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19218',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an L-lysine derivative in which the N(6) of the lysine is substituted with a 4-aminobutyl group. It has a role as a human metabolite. It is a non-proteinogenic L-alpha-amino acid and a L-lysine derivative. It derives from a hypusine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCNCCCCN)C[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21839',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a polyanionic polymer obtained by deprotonation of the carboxy groups of [(1->2)-alpha-L-rhamnosyl-(1->4)-alpha-D-galacturonosyl]n; major species at pH 7.3. It is a polyanionic polymer and a carbohydrate acid anion.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@@H]([C@H]([C@H](O[C@@H]2C(=O)[O-])O)O)O)O)O)O\\nThe natural language question is: The molecule is a trisaccharide consisting of beta-L-fucopyranose and D-glucopyranose residues joined in sequence by a (1->2) glycosidic bond, in which the hydroxy group at position 3 of the glucopyranose moiety has been glycosylated by beta-D-galactopyranose.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](OC2O)CO)O)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O)O\\nThe natural language question is: The molecule is a steroid saponin isolated from Ornithogalum saundersiae and Galtonia candicans and has been shown to exhibit cytotoxic activity. 
It has a role as a metabolite and an antineoplastic agent. It is a beta-D-glucoside, a 17-hydroxy steroid, an acetate ester and a cinnamate ester. It derives from a trans-cinnamic acid.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)CCC(C)C)[C@]1([C@H](C[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC=C4[C@@]3(CC[C@@H](C4)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O)C)C)O[C@H]6[C@@H]([C@H]([C@H](CO6)O)O[C@H]7[C@@H]([C@H]([C@@H](CO7)O)O)OC(=O)/C=C/C8=CC=CC=C8)OC(=O)C)O\\nThe natural language question is: The molecule is a trienoic fatty acid consisting of (8Z,11Z,14Z)-icosa-8,11,14-trienoic acid having additional (7R)-hydroxy- and (5S,6S)-epoxy groups. It is an epoxy fatty acid, a hydroxy fatty acid, a polyunsaturated fatty acid, a long-chain fatty acid and an icosanoid. It derives from an all-cis-icosa-8,11,14-trienoic acid. It is a conjugate acid of a (7R)-hydroxy-(5S,6S)-epoxy-(8Z,11Z,14Z)-icosatrienoate.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\[C@@H]([C@@H]1[C@@H](O1)CCCC(=O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a dimeric naphthopyran with formula C32H30O12, isolated from several Aspergillus species. It has a role as an Aspergillus metabolite and a marine metabolite. It is a biaryl, a polyphenol, an aromatic ether, a cyclic hemiketal, an aromatic ketone, a cyclic ketone and a naphtho-gamma-pyrone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1(CC(=O)C2=C(O1)C=C3C=C(C(=C(C3=C2O)OC)C4=C5C(=C(C6=C4C=C(C=C6OC)OC)O)C(=O)CC(O5)(C)O)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18123',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of 1-benzofurans and a carbamate ester. 
It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor, a carbamate insecticide, an acaricide, an agrochemical and a nematicide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCN(CCCC)SN(C)C(=O)OC1=CC=CC2=C1OC(C2)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6529',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an L-alpha-amino acid zwitterion obtained by transfer of a proton from the carboxy to the amino group of 4-chloro-L-lysine It is a conjugate base of a 4-chloro-L-lysinium. It is a tautomer of a 4-chloro-L-lysine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CN)C(C[C@@H](C(=O)[O-])[NH3+])Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29046',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is zwitterionic form of S-inosyl-L-homocysteine arising from transfer of a proton from the carboxy to the amino group; major species at pH 7.3. It is a tautomer of a S-inosyl-L-homocysteine.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(C(=O)N1)N=CN2[C@H]3[C@@H]([C@@H]([C@H](O3)CSCC[C@@H](C(=O)[O-])[NH3+])O)O\\nThe natural language question is: The molecule is an alkylcob(III)alamin and a member of cob(III)alamins. It has a role as a human metabolite. 
It is a conjugate acid of a hydroxocobalamin.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C=C1C)N(C=N2)[C@@H]3[C@@H]([C@@H]([C@H](O3)CO)OP(=O)([O-])O[C@H](C)CNC(=O)CC[C@@]\\\\\\\\4([C@H]([C@@H]5[C@]6([C@@]([C@@H](C(=N6)/C(=C\\\\\\\\7/[C@@]([C@@H](C(=N7)/C=C\\\\\\\\8/C([C@@H](C(=N8)/C(=C4\\\\\\\\[N-]5)/C)CCC(=O)N)(C)C)CCC(=O)N)(C)CC(=O)N)/C)CCC(=O)N)(C)CC(=O)N)C)CC(=O)N)C)O.[OH3+].[Co]\\nThe natural language question is: The molecule is a carboxylic ester obtained by formal condensation of the carboxy group of phthalimidoacetic acid with the phenolic hydroxy group of 4-nitrophenol. It is a C-nitro compound, a carboxylic ester and a member of phthalimides. It derives from a 4-nitrophenol, a glycine and a phthalimide.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C(=O)N(C2=O)CC(=O)OC3=CC=C(C=C3)[N+](=O)[O-]\\nThe natural language question is: The molecule is a methanesulfonate ester that is butane-1,4-diol in which the hydrogens of the hydroxy groups are replaced by methanesulfonyl groups. An alkylating antineoplastic agent, it is used for the treatment of chronic myeloid leukemia (although it has been largely replaced by newer drugs). It is also used as an insect sterilant. It has a role as an insect sterilant, an antineoplastic agent, a teratogenic agent, a carcinogenic agent and an alkylating agent. It derives from a butane-1,4-diol.\\nThe corresponding SMILES representation is:\\nCS(=O)(=O)OCCCCOS(=O)(=O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a ribose monophosphate that is 4-(beta-D-ribofuranosyl)aminobenzene carrying a single monophospate substituent at position 5'. It is a ribose monophosphate, a C-glycosyl compound and a substituted aniline. 
It is a conjugate acid of a 4-(beta-D-ribofuranosyl)aminobenzene 5'-phosphate(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=CC=C1[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O)O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27452',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a 4-hydroxy steroid that consists of 17beta-estradiol having an additional hydroxy group at position 4. It has a role as a metabolite. It derives from a 17beta-estradiol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=C3C=CC(=C4O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13701',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a labdane diterpenoid in which the labdane skeleton has double bonds at C-12 and C-14 (the former with Z-stereochemistry) and carries a hydroxy group at position C-8. It has a role as a metabolite. It is a labdane diterpenoid and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=C/C[C@@H]1[C@]2(CCCC([C@@H]2CC[C@@]1(C)O)(C)C)C)/C=C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18341',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a prostaglandin carboxylic acid anion that is the conjugate base of 20-hydroxyprostaglandin E1, obtained by deprotonation of the carboxy group; major species at pH 7.3. It derives from a prostaglandin E1(1-). 
It is a conjugate base of a 20-hydroxyprostaglandin E1.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H]([C@H](C1=O)CCCCCCC(=O)[O-])/C=C/[C@H](CCCCCO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17607',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a monohydroxyanthraquinone that is 9,10-anthraquinone in which the hydrogens at positions 1 and 2 are replaced by a hydroxy and an allyl group, respectively. It has a role as an EC 1.4.1.3 {glutamate dehydrogenase [NAD(P)(+)]} inhibitor.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C=CCC1=C(C2=C(C=C1)C(=O)C3=CC=CC=C3C2=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8263',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a bis(azo) compound that consists of a benzene core having two (4-amino-6-chloro-1,3,5-triazin-2-yl)amino groups attached at positions 1 and 4 and which in turn have 5-hydroxy-6-[(2-sulfophenyl)diazenyl]-2,7-disulfonaphthalen-4-yl groups attached to their 4-amino functions. It has a role as a dye. 
It is a bis(azo) compound, a member of azobenzenes, a chloro-1,3,5-triazine, a diamino-1,3,5-triazine and a naphthalenesulfonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C(=C1)N=NC2=C(C3=C(C=C(C=C3C=C2S(=O)(=O)O)S(=O)(=O)O)NC4=NC(=NC(=N4)NC5=CC=C(C=C5)NC6=NC(=NC(=N6)Cl)NC7=C8C(=CC(=C7)S(=O)(=O)O)C=C(C(=C8O)N=NC9=CC=CC=C9S(=O)(=O)O)S(=O)(=O)O)Cl)O)S(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12236',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a neoglycosphingolipid that is fluorinated alpha-C-GalCer, a synthesized C-clycosyl analogue of myelin-derived galactosylceramide (Mye-GalCer). It is a C-glycosyl compound, a neoglycosphingolipid and an organofluorine compound. It derives from an alpha-C-GalCer.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](C[C@H]([C@@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)F)[C@@H]([C@@H](CCCCCCCCCCCCCCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14251',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 1,2,4-triazole compound having a 3,5-bis(2-cyano-2-propyl)benzyl group at the 1-position. It has a role as an antineoplastic agent and an EC 1.14.14.14 (aromatase) inhibitor. It is a member of triazoles and a nitrile.\\nThe corresponding SMILES representation is:\\nCC(C)(C#N)C1=CC(=CC(=C1)CN2C=NC=N2)C(C)(C)C#N\\nThe natural language question is: The molecule is an organic cation obtained by protonation of the six free amino groups of neomycin C; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. 
It is a conjugate acid of a neomycin C.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@@H]([C@H]([C@@H]([C@H]1[NH3+])O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)C[NH3+])O)O)[NH3+])O[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O[C@@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C[NH3+])O)O)[NH3+])O)O)[NH3+]\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (13Z,16Z,19Z,22Z,25Z)-3-oxooctacosapentaenoic acid. It is a 3-oxo-fatty acyl-CoA, an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. It is a conjugate acid of a (13Z,16Z,19Z,22Z,25Z)-3-oxooctacosapentaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is the (S)-enantiomer of methylmalonyl-CoA. It has a role as an Escherichia coli metabolite and a mouse metabolite. It derives from a coenzyme A. 
It is a conjugate acid of a (S)-methylmalonyl-CoA(5-).\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)O)C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a phosphatidylethanolamine 36:2 obtained by transfer of a proton from the amino to the phosphate group of 1,2-dioleoyl-sn-glycero-3-phosphoethanolamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29691',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a pyrimidine nucleotide-sugar having thymine as the nucleobase and 4-dehydro-2,3,6-trideoxy-alpha-D-glucose as the sugar component. It has a role as a bacterial metabolite. It is a conjugate acid of a dTDP-4-dehydro-2,3,6-trideoxy-alpha-D-glucose(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C(=O)CC[C@H](O1)OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2510',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a dipeptide formed from L-threonine and L-proline residues. It has a role as a metabolite. 
It derives from a L-threonine and a L-proline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@@H](C(=O)N1CCC[C@H]1C(=O)O)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19302',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 24-residue single-stranded DNA oligonucleotide consisting of an alternating sequence of deoxycytidine and deoxyguanosine residues, all connected by 3'->5' phosphodiester linkages. It forms one strand of the double-stranded DNA oligonucleotide [d(C-G).d(C-G)]12 [synonym d(CG/GC)12].\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H](O[C@H]1N2C=NC3=C2N=C(NC3=O)N)COP(=O)(O)O[C@H]4C[C@@H](O[C@@H]4COP(=O)(O)O[C@H]5C[C@@H](O[C@@H]5COP(=O)(O)O[C@H]6C[C@@H](O[C@@H]6COP(=O)(O)O[C@H]7C[C@@H](O[C@@H]7COP(=O)(O)O[C@H]8C[C@@H](O[C@@H]8COP(=O)(O)O[C@H]9C[C@@H](O[C@@H]9COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1COP(=O)(O)O[C@H]1C[C@@H](O[C@@H]1CO)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)N1C=NC2
=C1N=C(NC2=O)N)N1C=CC(=NC1=O)N)O\\nThe natural language question is: The molecule is the acetate ester of isoamylol. It has a role as a metabolite and a Saccharomyces cerevisiae metabolite. It derives from an isoamylol.\\nThe corresponding SMILES representation is:\\nCC(C)CCOC(=O)C\\nThe natural language question is: The molecule is a sulfoglycolipid in which alpha,alpha-trehalose, sulfated at the 2'-position, is acylated at the 2-position with palmitic acid, and at the 3-position with a hydroxyphthioceranoic acid (an octamethyl-branched dextrogyre C32 long chain where the stereochemistry at all methyl branches is L). It is a sulfoglycolipid and a polyacyl alpha,alpha-trehalose derivative. It derives from an alpha,alpha-trehalose.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC([C@H](C)C[C@H](C)C[C@H](C)C[C@H](C)C[C@H](C)C[C@H](C)C[C@H](C)C[C@H](C)C(=O)O[C@H]1[C@@H]([C@H](O[C@@H]([C@@H]1OC(=O)CCCCCCCCCCCCCCC)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)OS(=O)(=O)O)CO)O)O\\nThe natural language question is: The molecule is the dolichyl diphosphooligosaccharide(2-) species that is the dianion formed by loss of protons from the diphospho linkage in alpha-D-Glc-(1->3)-alpha-D-Man-(1->2)-alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->2)-alpha-D-Man-(1->6)]-alpha-D-Man-(1->6)]-beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc(PP-Dol); major microspecies at pH 7.3. 
It is a conjugate base of an alpha-D-Glc-(1->3)-alpha-D-Man-(1->2)-alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->2)-alpha-D-Man-(1->6)]-alpha-D-Man-(1->6)]-beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc(PP-Dol).\\nThe corresponding SMILES representation is:\\nCC(CC/C=C(/C)\\\\\\\\CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)CCOP(=O)([O-])OP(=O)([O-])OC1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)O)O[C@@H]7[C@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O[C@@H]8[C@H]([C@H]([C@@H]([C@H](O8)CO)O)O)O)O)O)O[C@@H]9[C@H]([C@H]([C@@H]([C@H](O9)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O)O)O)NC(=O)C)O)NC(=O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a triol that is pentane with the three hydroxy groups located at positions 1,3 and 5. It derives from a hydride of a pentane.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C(CO)C(CCO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22587',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a heptadienal that is hepta-2,4-dienal substituted by an oxo group at position 6 and a methyl group at position 2. 
It is a heptadienal, a methyl ketone and an enone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)/C=C/C=C(\\\\\\\\C)/C=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3776',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of catechols carrying two bromo substituents at positions 3 and 5. It has a role as an algal metabolite, a marine xenobiotic metabolite, a mouse metabolite and a bacterial xenobiotic metabolite. It is a member of catechols and a dibromobenzene.\\nThe corresponding SMILES representation is:\\nC1=C(C=C(C(=C1O)O)Br)Br\\nThe natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of 3-hydroxy-3-methylhexanoic acid, arising from deprotonation of the carboxy group. It is a hydroxy monocarboxylic acid anion and a 3-methyl fatty acid anion. It derives from a hexanoate. It is a conjugate base of a 3-hydroxy-3-methylhexanoic acid.\\nThe corresponding SMILES representation is:\\nCCCC(C)(CC(=O)[O-])O\\nThe natural language question is: The molecule is an amino acid zwitterion arising from transfer of a proton from the carboxy to the amino group of 5-hydroxy-L-tryptophan; major species at pH 7.3. It is a tautomer of a 5-hydroxy-L-tryptophan.\\nThe corresponding SMILES representation is:\\nC1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is an N-(2-naphthyl)carboxamide obtained by formal condensation of the carboxy group of L-seryl-L-tyrosine with the amino group of 2-naphthylamine. It has a role as a chromogenic compound. 
It is a N-(2-naphthyl)carboxamide and a dipeptide.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C=C(C=CC2=C1)NC(=O)[C@H](CC3=CC=C(C=C3)O)NC(=O)[C@H](CO)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 4-(1-hydroxy-2-{[4-(4-hydroxyphenyl)butan-2-yl]amino}ethyl)phenol in which both stereocentres have R configuration. It is the most active of the four diastereoisomers that make up the animal feed additive ractopamine. It has a role as a beta-adrenergic agonist and an animal growth promotant. It is an enantiomer of an ent-butopamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC1=CC=C(C=C1)O)NC[C@@H](C2=CC=C(C=C2)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13362',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of cardenolides that is 7,8-epoxycard-20(22)-enolide substituted by hydroxy groups at positions 3 and 14 (the 3beta,5beta,7beta stereoisomer). It has a role as an antineoplastic agent and a metabolite. It is a member of cardenolides, a secondary alcohol, a tertiary alcohol, an epoxy steroid, a 3beta-hydroxy steroid and a 14beta-hydroxy steroid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@@H]3[C@]4(CC[C@@H](C[C@H]4C[C@H]5[C@]3([C@]1(CC[C@@H]2C6=CC(=O)OC6)O)O5)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27194',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a tetramethylrhodium dye conjugated to the bicyclic peptide phalloidin via a thiourea linkage. It has a role as a fluorochrome. 
It derives from a phalloidin and a tetramethylrhodamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C(=O)N[C@H]2CC3=C(NC4=CC=CC=C34)SC[C@@H](C(=O)N5C[C@H](C[C@H]5C(=O)N1)O)NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](NC2=O)C[C@](C)(CNC(=S)/N=C/6\\\\\\\\C=CC(=C7C8=C(C=C(C=C8)N(C)C)OC9=C7C=CC(=C9)N(C)C)C(=C6)C(=O)O)O)C)[C@H](C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23864',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a hydroxy fatty acid ascaroside anion that is the conjugate base of oscr#36, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of an oscr#36.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCCCCCCCCCCC(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7206',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organofluorine compound, an organochlorine compound, a member of quinazolines and a member of furans. It has a role as an antineoplastic agent and a tyrosine kinase inhibitor. It derives from a monofluorobenzene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3NC4=CC(=C(C=C4)OCC5=CC(=CC=C5)F)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12053',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an alpha,omega dicarboxyacyl-CoA that results from the formal condensation of the thiol group of coenzyme A with one of the carboxy groups of octanedioic acid. It derives from a suberic acid. 
It is a conjugate acid of an octanedioyl-CoA(5-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)CCCCCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10699',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of zofenoprilat, obtained by deprotonation of the carboxy group. It is a conjugate base of a zofenoprilat.\\nThe corresponding SMILES representation is:\\nC[C@H](CS)C(=O)N1C[C@H](C[C@H]1C(=O)[O-])SC2=CC=CC=C2\\nThe natural language question is: The molecule is a diphenylethane that is 1,2-dihydrostilbene substituted by hydroxy groups at positions 3, 4 and 4' and methoxy groups at C-5 and alpha-position respectively (the S stereoisomer). It is isolated from the stems of Dendrobium candidum and exhibits antioxidant activity. It has a role as a metabolite and a radical scavenger. It is a diphenylethane, a member of catechols and a member of methoxybenzenes. It derives from a hydride of a 1,2-dihydrostilbene.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC(=C1O)O)[C@H](CC2=CC=C(C=C2)O)OC\\nThe natural language question is: The molecule is a cyclic hexapeptide echinocandin antibiotic isolated from Aspergillus spp. By inhibiting the conversion of lanosterol to ergosterol, it invades a fungus' ability to synthesize cell walls. A modified form of echinocandin B, it is an antimycotic agent against Candida albicans. It has a role as an antiinfective agent. 
It is an echinocandin and an antibiotic antifungal drug.\\nThe corresponding SMILES representation is:\\nCCCCCCCCOC1=CC=C(C=C1)C(=O)N[C@H]2C[C@H]([C@H](NC(=O)[C@@H]3[C@H]([C@H](CN3C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H]4C[C@H](CN4C(=O)[C@@H](NC2=O)[C@@H](C)O)O)[C@@H]([C@H](C5=CC=C(C=C5)O)O)O)[C@@H](C)O)C)O)O)O\\nThe natural language question is: The molecule is a branched amino oligosaccharide comprising two fucosyl residues, three galactose residues and two N-acetylglucosamine residues, with a glucose residue at the reducing end, in the arrangement shown. It is an amino oligosaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)CO)OC[C@@H]4[C@@H]([C@@H]([C@H]([C@@H](O4)O[C@@H]5[C@H](OC([C@@H]([C@H]5O)O)O)CO)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO)O)O)O[C@H]8[C@H]([C@@H]([C@@H]([C@@H](O8)C)O)O)O)O)NC(=O)C)O)NC(=O)C)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is conjugate base of 2-oxopropyl-coenzyme M arising from deprotonation of the sulfonate function. It is a conjugate base of a 2-oxopropyl-CoM.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=O)CSCCS(=O)(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22316',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an anthraquinone that is knipholone in which the O-methyl group is replaced by a beta-D-glucopyranosyl group. It is isolated from the roots of Bulbine frutescens and exhibits trypanocidal and antiplasmodial activities. It has a role as a metabolite, an antiplasmodial drug and a trypanocidal drug. It is a beta-D-glucoside, a polyphenol, a methyl ketone, an aromatic ketone and a dihydroxyanthraquinone. 
It derives from a knipholone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=C2C(=C1C3=C(C(=C(C=C3O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)C(=O)C)O)C(=O)C5=C(C2=O)C(=CC=C5)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15264',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an azacycloalkane that is the 1,2-diaza derivative of cyclopentane It is a saturated organic heteromonocyclic parent, a member of pyrazolidines and an azacycloalkane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CNNC1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22855',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a monounsaturated fatty acid that is (2E)-hexenoic acid in which the hydrogen at position 3 has been replaced by a methyl group. A malodourous component in the sweat of schizophrenics. It is a short-chain fatty acid, a monounsaturated fatty acid and an alpha,beta-unsaturated monocarboxylic acid. It derives from a (2E)-hexenoic acid. It is a conjugate acid of a (2E)-3-methylhex-2-enoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC/C(=C/C(=O)O)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29079',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a member of the class of pyridines that is pyridine which is substituted at positions 2, 3, and 6 by (2-methoxyethoxy)methyl, (2-hydroxy-4-oxobicyclo[3.2.1]oct-2-en-3-yl)carbonyl, and trifluoromethyl groups, respectively. It is a broad-spectrum herbicide developed by Syngenta and used for the pre- and post-emergence control of weeds in corn. 
It has a role as a herbicide, an agrochemical and a carotenoid biosynthesis inhibitor. It is a member of pyridines, an aromatic ketone, a carbobicyclic compound, a beta-diketone, an organofluorine compound, an ether, an enol and an enone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COCCOCC1=C(C=CC(=N1)C(F)(F)F)C(=C2C(=O)[C@@H]3CC[C@@H](C3)C2=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15554',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a nucleoside 3',5'-cyclic phosphate that is cAMP in which the methine (C-H) group at position 8 on the purine fragment is replaced by nitrogen. It is a member of triazolopyrimidines and a nucleoside 3',5'-cyclic phosphate. It derives from an 8-azaguanine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]2[C@H]([C@H]([C@@H](O2)N3C4=NC=NC(=C4N=N3)N)O)OP(=O)(O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13911',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a polyprenyl phospho oligosaccharide that consists of a N-acetyl-alpha-D-galactosaminyl residue linked via a diphospho group to ditrans,polycis-undecaprenol. 
It is a conjugate acid of a N-acetyl-alpha-D-galactosaminyl-1-diphospho-ditrans,polycis-undecaprenol(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)(O)OP(=O)(O)O[C@@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27065',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pregnane sterol that is pregnenolone substituted by two hydroxy groups at positions 16 and 17 (16alpha,17alpha-stereoisomer). It is a hydroxypregnenolone, a pregnane sterol, a 16alpha-hydroxy steroid, a 17alpha-hydroxy steroid and a tertiary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)[C@]1([C@@H](C[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC=C4[C@@]3(CC[C@@H](C4)O)C)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15904',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a tetrapyrrole fundamental parent that is the core macrocycle of vitamin B12. It has a role as a cofactor. It is a member of corrins and a tetrapyrrole fundamental parent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC\\\\\\\\2=NC1C3CCC(=N3)/C=C\\\\\\\\4/CCC(=N4)/C=C\\\\\\\\5/CC/C(=C2)/N5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24193',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a member of quinolines, a dichlorobenzene, a monocarboxylic acid and a carboxylic ester. 
It contains a 2,6-dichlorobenzoyl group. It derives from a rac-lactic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1)Cl)C2=NC3=C(C=C2)C=C(C=C3)CC(C(=O)O)OC(=O)C4=C(C=CC=C4Cl)Cl)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7367',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a {2-[3-(4-chlorophenyl)propyl]-2,4,4-trimethyl-1,3-oxazolidin-3-yl}(imidazol-1-yl)methanone that is the (S)-enantiomer of oxpoconazole. It is a conjugate base of a (S)-oxpoconazole(1+). It is an enantiomer of a (R)-oxpoconazole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]1(N(C(CO1)(C)C)C(=O)N2C=CN=C2)CCCC3=CC=C(C=C3)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5004',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a cyclic ketone derived from spiro[4.5]dec-6-en-8-one by substitution of hydrogens by methyl groups at positions 6 and 10, and by an isopropenyl group at position 2 (the (2R,5S,10R)-diastereoisomer). It has a role as a phytoalexin and a plant metabolite. It is a spiro compound, a sesquiterpenoid and a cyclic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1CC(=O)C=C([C@]12CC[C@H](C2)C(=C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25538',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a long-chain alkane consisting of an unbranched chain of 34 carbon atoms. 
It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4648',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is tetracarboxylate anion of coproporphyrinogen III. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of a coproporphyrinogen III.\\nThe corresponding SMILES representation is:\\nCC1=C2CC3=C(C(=C(N3)CC4=C(C(=C(N4)CC5=C(C(=C(N5)CC(=C1CCC(=O)[O-])N2)C)CCC(=O)[O-])C)CCC(=O)[O-])CCC(=O)[O-])C\\nThe natural language question is: The molecule is an acyl-CoA(4-) that is the tetraanion of indol-3-ylacetyl-CoA, arising from deprotonation of the phosphate and diphosphate OH groups. It is a conjugate base of an indol-3-ylacetyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CC4=CNC5=CC=CC=C54)O\\nThe natural language question is: The molecule is the conjugate base of oxaluric acid; major species at pH 7.3. It is a conjugate base of an oxaluric acid.\\nThe corresponding SMILES representation is:\\nC(=O)(C(=O)[O-])NC(=O)N\\nThe natural language question is: The molecule is a member of the class of tryptamines that is serotonin in which one of the hydrogens attached to the primary amino group is replaced by a methyl group. It has a role as a plant metabolite and a human metabolite. It is a member of phenols and a member of tryptamines. It derives from a serotonin.\\nThe corresponding SMILES representation is:\\nCNCCC1=CNC2=C1C=C(C=C2)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the D-enantiomer of erythrose. It has a role as a plant metabolite. 
It is an enantiomer of a L-erythrose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@H](C=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28276',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a carbohydrate acid anion that is the conjugate base of D-galactaro-1,5-lactone, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a D-galactaro-1,5-lactone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@@H]1([C@H]([C@H](OC(=O)[C@@H]1O)C(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22781',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a flavonoid oxoanion that is the conjugate base of (S)-naringenin, arising from selective deprotonation of the 7-hydroxy group; major species at pH 7.3. It is a conjugate base of a (S)-naringenin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H](OC2=CC(=CC(=C2C1=O)[O-])O)C3=CC=C(C=C3)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21609',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a one-carbon compound in which the carbon is joined only to a single oxygen. It is a colourless, odourless, tasteless, toxic gas. It has a role as a neurotoxin, a signalling molecule, a vasodilator agent, a neurotransmitter, a metabolite, a P450 inhibitor, a ligand, a biomarker, a probe, a human metabolite, a mouse metabolite, an EC 1.9.3.1 (cytochrome c oxidase) inhibitor and a mitochondrial respiratory-chain inhibitor. It is a one-carbon compound, a gas molecular entity and a carbon oxide. 
It is a conjugate base of a carbon monoxide(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C-]#[O+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16314',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of benzimidazoles that is 2-aminobenzimidazole in which the primary amino group is substituted by a methoxycarbonyl group. A fungicide, carbendazim controls Ascomycetes, Fungi Imperfecti, and Basidiomycetes on a wide variety of crops, including bananas, cereals, cotton, fruits, grapes, mushrooms, ornamentals, peanuts, sugarbeet, soybeans, tobacco, and vegetables. It has a role as an antinematodal drug, a metabolite, a microtubule-destabilising agent and an antifungal agrochemical. It is a carbamate ester, a member of benzimidazoles, a benzimidazole fungicide and a benzimidazolylcarbamate fungicide. It derives from a 2-aminobenzimidazole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC(=O)NC1=NC2=CC=CC=C2N1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_443',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a fatty acid-taurine conjugate obtained by deprotonation of the sulfonate group of N-hexadecanoyltaurine; major species at pH 7.3. 
It is a conjugate base of a N-hexadecanoyltaurine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)NCCS(=O)(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15044',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 1,2-diacyl-sn-glycero-3-phosphate(2-) obtained by deprotonation of the phosphate OH groups of 1-heptadecanoyl-2-arachidonoyl-sn-glycero-3-phosphate; major species at pH 7.3. It is a conjugate base of a 1-heptadecanoyl-2-arachidonoyl-sn-glycero-3-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])[O-])OC(=O)CCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1147',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a geranylgeranyl diphosphate. It has a role as a human metabolite and a mouse metabolite. It is a conjugate acid of a 2-cis,6-trans,10-trans-geranylgeranyl diphosphate(3-).\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\COP(=O)(O)OP(=O)(O)O)/C)/C)/C)C\\nThe natural language question is: The molecule is an N-acyl-amino acid obtained by formal condensation of the carboxy group of 6-hydroxyindole-3-acetic acid with the amino group of phenylalanine. It has a role as an Arabidopsis thaliana metabolite. It is a phenylalanine derivative, a member of hydroxyindoles, a N-acyl-amino acid and a secondary carboxamide. 
It derives from an indole-3-acetic acid.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)CC(C(=O)O)NC(=O)CC2=CNC3=C2C=CC(=C3)O\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (9Z,12Z)-pentadecadienoic acid. It is a long-chain fatty acyl-CoA and an unsaturated fatty acyl-CoA. It is a conjugate acid of a (9Z,12Z)-pentadecadienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is an acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of nonadecanoyl-CoA. It is a saturated fatty acyl-CoA(4-) and a long-chain fatty acyl-CoA(4-). It is a conjugate base of a nonadecanoyl-CoA.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a bile acid anion that is the conjugate base of ursodeoxycholic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It has a role as a human metabolite. It is a bile acid anion and a cholanic acid anion. 
It is a conjugate base of an ursodeoxycholic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)[O-])[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25075',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a 4-[3-(4-tert-butylphenyl)-2-methylpropyl]-2,6-dimethylmorpholine in which the methyl substituents on the morpholine ring are in a cis relationship to each other and in which the remaining stereocentre has R configuration. It is an enantiomer of a (S)-fenpropimorph.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1CN(C[C@@H](O1)C)C[C@H](C)CC2=CC=C(C=C2)C(C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27163',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a member of the class of chromenes that is 2H-1-benzopyran-7-ol acetate substituted by methyl groups at positions 2 and 2, an ethyl group at position 4 and a 4-(acetyloxy)phenyl group at position 3 respectively. It is a member of chromenes, an acetate ester and a diester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=C(C(OC2=C1C=CC(=C2)OC(=O)C)(C)C)C3=CC=C(C=C3)OC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18446',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an ether compound having fluoromethyl and 1,1,1,3,3,3-hexafluoroisopropyl as the two alkyl groups. It has a role as an inhalation anaesthetic, a platelet aggregation inhibitor and a central nervous system depressant. It is an organofluorine compound and an ether. 
It derives from a 2-methoxypropane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(OC(C(F)(F)F)C(F)(F)F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13790',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a monocarboxylic acid anion that is the is conjugate base of 4-O-beta-D-glucosyl-4-coumaric acid. It derives from a 4-coumarate. It is a conjugate base of a 4-O-beta-D-glucosyl-4-coumaric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C/C(=O)[O-])O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2085',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a branched amino pentasaccharide consisting of beta-galactosyl-(1->4)-N-acetyl-beta-D-glucosaminyl-(1->3)-D-galactose in which the galactosyl residue at the non-reducing end has alpha-sialyl and N-acetyl-beta-D-glucosaminyl residues attached via glycosidic linkages at positions 3 and 4 respectively. Corresponds to the Sda + pentasaccharide from Tamm-Horsfall glycoprotein. It is an amino pentasaccharide, a glucosamine oligosaccharide and a galactosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H](C[C@@](O[C@H]1[C@@H]([C@@H](CO)O)O)(C(=O)O)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)NC(=O)C)CO)O[C@@H]4[C@H](O[C@H]([C@@H]([C@H]4O)NC(=O)C)O[C@H]5[C@H]([C@H](OC([C@@H]5O)O)CO)O)CO)O)O\\nThe natural language question is: The molecule is a nucleoside 5'-monophosphate(2-) that results from the removal of two protons from the phosphate group of adenosine 5'-monophosphate (AMP). It has a role as a human metabolite, a fundamental metabolite and a cofactor. 
It is a conjugate base of an adenosine 5'-monophosphate.\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])[O-])O)O)N\\nThe natural language question is: The molecule is a divalent inorganic anion obtained by removal of both protons from hydrogen sulfide. It is a conjugate base of a hydrosulfide.\\nThe corresponding SMILES representation is:\\n[S-2]\\nThe natural language question is: The molecule is a broad spectrum, third-generation cephalosporin antibiotic with (Z)-2-(4-methyl-1,3-thiazol-5-yl)ethenyl and (2Z)-2-(2-amino-1,3-thiazol-4-yl)-2-(methoxyimino)acetamido groups at positions 3 and 7, respectively, of the cephem skeleton. Generally administered as its orally absorbed pivaloyloxymethyl ester prodrug, it is used for the treatment of mild to moderate infections caused by susceptible strains of microorganisms in acute bacterial exacerbation of chronic bronchitis, community-acquired pneumonia, pharyngitis/tonsillitis, and uncomplicated skin and skin-structure infections. It has a role as an antibacterial drug. It is a cephalosporin and a carboxylic acid.\\nThe corresponding SMILES representation is:\\nCC1=C(SC=N1)/C=C\\\\\\\\C2=C(N3[C@@H]([C@@H](C3=O)NC(=O)/C(=N\\\\\\\\OC)/C4=CSC(=N4)N)SC2)C(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a UDP-sugar having alpha-xylose as the sugar component. It is an important metabolite in the nucleotide sugar metabolism in animals, plants, fungi, and bacteria. It has a role as a fundamental metabolite. 
It is a conjugate acid of an UDP-alpha-D-xylose(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@H]([C@@H]([C@H]([C@H](O1)OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11973',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organic sodium salt comprising equimolar amounts of chlorfenac(1-) anions and sodium cations. An obsolete herbicide. It has a role as an agrochemical, a herbicide and a synthetic auxin. It contains a chlorfenac(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C(=C1Cl)CC(=O)[O-])Cl)Cl.[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9956',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of anthocyanin chlorides that has pelargonidin 3-O-beta-D-glucoside as the cationic counterpart. It contains a pelargonidin 3-O-beta-D-glucoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C2=[O+]C3=CC(=CC(=C3C=C2O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O)O)O.[Cl-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7340',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a flavonolignan isolated from Silybum marianum and has been shown to exhibit inhibitory activities against lipoxygenase and prostaglandin synthetase. It has a role as a radical scavenger, a lipoxygenase inhibitor, a prostaglandin antagonist and a metabolite. 
It is a flavonolignan, a member of 1-benzofurans, a polyphenol, an aromatic ether and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)[C@H]2[C@@H](C3=C(O2)C(=CC(=C3)[C@@H]4[C@H](C(=O)C5=C(C=C(C=C5O4)O)O)O)O)CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18642',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a CDP-diacylglycerol(2-) obtained by deprotonation of the diphosphate OH groups of CDP-1,2-diarachidonoyl-sn-glycerol; major species at pH 7.3. It is a conjugate base of a CDP-1,2-diarachidonoyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)OC[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=CC(=NC2=O)N)O)O)OC(=O)CCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nThe natural language question is: The molecule is an octadecatrienoyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (2E,9Z,12Z)-octadecatrienoic acid. It is a trans-2-enoyl-CoA and an octadecatrienoyl-CoA. It is a conjugate acid of a (2E,9Z,12Z)-octadecatrienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is a phenylpropanoid that is one of the main monolignols, produced by the reduction of the carboxy functional group in cinnamic acid and the addition of a hydroxy and a methoxy substituent to the aromatic ring. It has a role as a monolignol, a mouse metabolite and a volatile oil component. It is a phenylpropanoid and a member of guaiacols. 
It derives from an (E)-cinnamyl alcohol.\\nThe corresponding SMILES representation is:\\nCOC1=C(C=CC(=C1)/C=C/CO)O\\nThe natural language question is: The molecule is a hydroxy fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 2-hydroxyhexanoic acid. It is a hydroxy fatty acyl-CoA and a medium-chain fatty acyl-CoA. It derives from a 2-hydroxyhexanoic acid. It is a conjugate acid of a 2-hydroxyhexanoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCCCCC(C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a dicarboximide that is 4-(2-hydroxyethyl)piperidine-2,6-dione in which one of the hydrogens attached to the carbon bearing the hydroxy group is replaced by a 3,5-dimethyl-2-oxocyclohexyl group. It is an antibiotic produced by the bacterium Streptomyces griseus. It has a role as a bacterial metabolite, a protein synthesis inhibitor, a neuroprotective agent and an anticoronaviral agent. It is a member of piperidones, a piperidine antibiotic, an antibiotic fungicide, a dicarboximide, a secondary alcohol and a cyclic ketone. It derives from a piperidine-2,6-dione.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C[C@@H](C(=O)[C@@H](C1)[C@@H](CC2CC(=O)NC(=O)C2)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26184',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a branched amino trisaccharide consisting of N-acetyl-beta-D-galactosamine having beta-D-glucuronosyl and N-acetyl-beta-D-galactosaminyl residues attached at the 3- and 6-positions respectively. 
It is an amino trisaccharide and a galactosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@H]1OC[C@@H]2[C@@H]([C@@H]([C@H]([C@@H](O2)O)NC(=O)C)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)O)O)O)O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14443',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a 3',5'-cyclic purine nucleotide in which the purine nucleobase is specified as guanidine. It has a role as a plant metabolite, a human metabolite, a Saccharomyces cerevisiae metabolite, an Escherichia coli metabolite and a mouse metabolite. It is a guanyl ribonucleotide and a 3',5'-cyclic purine nucleotide. It is a conjugate acid of a 3',5'-cyclic GMP(1-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C3N=C(NC4=O)N)O)OP(=O)(O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3757',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an amino nonasaccharide comprising a sequence of alpha-sialyl, beta-D-galactosyl, N-acetyl-beta-D-glucosaminyl, beta-D-galactosyl, N-acetyl-beta-D-glucosaminyl, beta-D-galactosyl and N-acetyl-beta-D-glucosamine residues connected by (2->6), (1->4), (1->3), (1->4), (1->3), (1->4) and (1->3) linkages respectively, to the reducing-end and proximal N-acetyl-beta-D-glucosaminyl residues of which are also (1->3)-linked alpha-L-fucosyl residues. It has a role as an epitope. 
It is an amino nonasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O[C@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO[C@@]8(C[C@@H]([C@H]([C@@H](O8)[C@@H]([C@@H](CO)O)O)NC(=O)C)O)C(=O)O)O)O)O)O)NC(=O)C)O)O[C@H]9[C@H]([C@@H]([C@@H]([C@@H](O9)C)O)O)O)NC(=O)C)O)CO)O)NC(=O)C)O)O)O\\nThe natural language question is: The molecule is a trivalent inorganic anion obtained by removal of all three protons from cyclotriphosphoric acid.; major species at pH 7.3. It is a triphosphate ion and a trivalent inorganic anion. It is a conjugate base of a cyclotriphosphoric acid.\\nThe corresponding SMILES representation is:\\n[O-]P1(=O)OP(=O)(OP(=O)(O1)[O-])[O-]\\nThe natural language question is: The molecule is a member of 1-benzofurans, a member of guaiacols, a guaiacyl lignin and a primary alcohol. It is an enantiomer of a (2S,3R)-dihydrodehydrodiconiferyl alcohol.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=CC2=C1O[C@H]([C@@H]2CO)C3=CC(=C(C=C3)O)OC)CCCO\\nThe natural language question is: The molecule is a limonoid with a phragmalin skeleton isolated from the leaves of Trichilia connaroides. It has a role as a plant metabolite. It is a limonoid, a delta-lactone, a bridged compound, a member of furans, an organic heteropentacyclic compound, an acetate ester, a methyl ester and an enoate ester. It derives from a tiglic acid.\\nThe corresponding SMILES representation is:\\nC/C=C(\\\\\\\\C)/C(=O)O[C@H]1[C@@]2(C[C@@]3([C@]1(CC4=C([C@@]3([C@H]2CC(=O)OC)C)CC[C@@]5(C4=C(C(=O)O[C@H]5C6=COC=C6)O)C)OC(=O)C)OC(=O)C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a dioxo monocarboxylic acid. It derives from a hexanoic acid. 
It is a conjugate acid of a 3-deoxy-D-glycero-hexo-2,5-diulosonate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H](C(=O)CO)O)C(=O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_224',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of thiazolidenediones that is 1,3-thiazolidine-2,4-dione substituted by a benzyl group at position 5 which in turn is substituted by a 2-(5-ethylpyridin-2-yl)ethoxy group at position 4 of the phenyl ring. It exhibits hypoglycemic activity. It has a role as an insulin-sensitizing drug, an EC 2.7.1.33 (pantothenate kinase) inhibitor and a xenobiotic. It is a member of thiazolidinediones, an aromatic ether and a member of pyridines.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC1=CN=C(C=C1)CCOC2=CC=C(C=C2)CC3C(=O)NC(=O)S3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12698',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a benzamide that is 4-aminobenzamide substituted on the amide N by a 2-(diethylamino)ethyl group. It is a pharmaceutical antiarrhythmic agent used for the medical treatment of cardiac arrhythmias. It has a role as a sodium channel blocker, an anti-arrhythmia drug and a platelet aggregation inhibitor.\\nThe corresponding SMILES representation is:\\nCCN(CC)CCNC(=O)C1=CC=C(C=C1)N\\nThe natural language question is: The molecule is a glucosamine phosphate. It has a role as an Escherichia coli metabolite. It derives from an alpha-D-glucosamine. 
It is a conjugate acid of an alpha-D-glucosamine 1-phosphate(1-).\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OP(=O)(O)O)N)O)O)O\\nThe natural language question is: The molecule is a linear amino tetrasaccharide consisting of two galactosyl, one glucosamine, two fucose and one glucose residue (at the reducing end) linked as shown. It is an amino hexasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H](O[C@H]([C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)NC(=O)C)O[C@H]4[C@H]([C@H](O[C@H]([C@@H]4O)O[C@@H]5[C@H](OC([C@@H]([C@H]5O[C@H]6[C@H]([C@@H]([C@@H]([C@@H](O6)C)O)O)O)O)O)CO)CO)O)CO)O)O)O\\nThe natural language question is: The molecule is a member of the class of phenols that is p-cresol in which one of the methyl hydrogens has been replaced by a methoxy group. It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\nCOCC1=CC=C(C=C1)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an amino tetrasaccharide comprising a linear chain of three alpha-D-mannose and one N-acetyl-alpha-D-glucosamine resides joined in sequence by (1->2), (1->6) and (1->2) glycosidic linkages.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O[C@H]2[C@H]([C@@H]([C@H](O[C@@H]2OC[C@@H]3[C@H]([C@@H]([C@@H]([C@H](O3)O[C@@H]4[C@H](O[C@@H]([C@@H]([C@H]4O)N)O)CO)O)O)O)CO)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26125',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of guanidines that is tetrahydropyrimidin-2(1H)-imine in which the hydrogen of the imino group is replaced by a thiazol-2-yl group which in turn is 
substituted by a 2-(2,4-dimethylphenoxy)phenyl group at position 4. It has been used for the topical treatment of fungal nail infections. It has a role as an antifungal drug. It is a member of 1,3-thiazoles, an aromatic ether and a member of guanidines.\\nThe corresponding SMILES representation is:\\nCC1=CC(=C(C=C1)OC2=CC=CC=C2C3=CSC(=N3)NC4=NCCCN4)C\\nThe natural language question is: The molecule is an oxaspiro compound that is 1-oxaspiro[4.5]deca-6,9-dien-8-one substituted by a (2R)-2-hydroxyheptadecyl moiety and a methoxy group at position 2. It is isolated from the leaves of Amomum aculeatum and exhibits anticancerous efficacy against human lung carcinoma, hormone-dependent human prostate carcinoma and human breast carcinoma. It has a role as an antineoplastic agent and a plant metabolite. It is an oxaspiro compound, a cyclic ketal, an enone and a secondary alcohol.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCC[C@H](CC1(CCC2(O1)C=CC(=O)C=C2)OC)O\\nThe natural language question is: The molecule is a beta-D-glucosyl-(1->4)-N-acetyl-D-glucosaminyl undecaprenyl diphosphate(2-) in which the anomeric centre connected to the diphosphate group has alpha-configuration. It is a conjugate base of a beta-D-glucosyl-(1->4)-N-acetyl-alpha-D-glucosaminyl undecaprenyl diphosphate.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])OP(=O)([O-])O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C\\nThe natural language question is: The molecule is a peptide zwitterion obtained by transfer of a proton from the carboxy to the amino terminus of Ala-Gly. 
It is a tautomer of an Ala-Gly.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)NCC(=O)[O-])[NH3+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a trimethoxyflavone that is 6-hydroxyluteolin in which the phenolic hydogens at positions 4', 6 and 7 have been replaced by methyl groups. It has a role as a Brassica napus metabolite, an apoptosis inducer, a vasodilator agent, a calcium channel blocker, an anti-inflammatory agent, a P450 inhibitor and an antineoplastic agent. It is a dihydroxyflavone, a trimethoxyflavone and a polyphenol. It derives from a 6-hydroxyluteolin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=C(C=C(C=C1)C2=CC(=O)C3=C(C(=C(C=C3O2)OC)OC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15353',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is conjugate base of 20-hydroxy-leukotriene E4. It has a role as a human metabolite. It is a conjugate base of a 20-hydroxy-leukotriene E4.\\nThe corresponding SMILES representation is:\\nC(CC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C=C\\\\\\\\[C@H]([C@H](CCCC(=O)[O-])O)SC[C@@H](C(=O)[O-])[NH3+])CCO\\nThe natural language question is: The molecule is a cytochalasan alkaloid isolated from Chaetomium globosum and Calonectria morganii. It has a role as a Chaetomium metabolite. It is a cytochalasan alkaloid, a member of indoles, a macrocycle, an epoxide and a secondary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\nC[C@H]\\\\\\\\1C/C=C/[C@H]2C3[C@](O3)([C@H]([C@@H]4[C@@]2(C(=O)/C=C/C(=O)[C@@H](/C(=C1)/C)O)C(=O)N[C@H]4CC5=CNC6=CC=CC=C65)C)C\\nThe natural language question is: The molecule is conjugate base of 5,12-dihydroxanthommatin having anionic carboxy groups and a protonated primary amino group. 
It is a conjugate base of a 5,12-dihydroxanthommatin.\\nThe corresponding SMILES representation is:\\nC1=CC(=C2C(=C1)OC3=C(N2)C4=C(C(=C3)[O-])NC(=CC4=O)C(=O)O)C(=O)CC(C(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is a hydrobromide obtained by combining (S)-SKF 38393 with one molar equivalent of hydrogen bromide. It contains a (S)-SKF 38393(1+). It is an enantiomer of a (R)-SKF 38393 hydrobromide.\\nThe corresponding SMILES representation is:\\nC1CNC[C@H](C2=CC(=C(C=C21)O)O)C3=CC=CC=C3.Br\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 3-nitrotyrosine comprising D-tyrosine having a nitro group at the 3-position on the phenyl ring. It is a 3-nitrotyrosine, a D-tyrosine derivative and a D-alpha-amino acid. It is an enantiomer of a 3-nitro-L-tyrosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1C[C@H](C(=O)O)N)[N+](=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23422',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a polyprenyl phospho oligosaccharide that consists of an alpha-L-Fuc-(1->2)-beta-D-Gal-(1->3)-alpha-D-GalNAc-(1->3)-alpha-D-GalNAc moiety linked via a diphospho group to ditrans,octacis-undecaprenol. 
It is a conjugate acid of an alpha-L-Fuc-(1->2)-beta-D-Gal-(1->3)-alpha-D-GalNAc-(1->3)-alpha-D-GalNAc-diphospho-ditrans,octacis-undecaprenol(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H]([C@H]([C@H](O[C@H]2O[C@@H]3[C@H]([C@H](O[C@@H]([C@@H]3O)CO)O[C@@H]4[C@H]([C@H](O[C@@H]([C@@H]4O)CO)OP(=O)(O)OP(=O)(O)OC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(/C)\\\\\\\\CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)NC(=O)C)NC(=O)C)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23450',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a phthalate ester that is the diester obtained by the formal condensation of the carboxy groups of phthalic acid with two molecules of butan-1-ol. Although used extensively as a plasticiser, it is a ubiquitous environmental contaminant that poses a risk to humans. It has a role as an environmental contaminant, a teratogenic agent, a plasticiser, a metabolite and an EC 3.2.1.20 (alpha-glucosidase) inhibitor. It is a phthalate ester and a diester. It derives from a butan-1-ol.\\nThe corresponding SMILES representation is:\\nCCCCOC(=O)C1=CC=CC=C1C(=O)OCCCC\\nThe natural language question is: The molecule is an optically active form of 3-amino-3-phenylpropanoic acid having S-configuration. It is an enantiomer of a (R)-3-amino-3-phenylpropanoic acid. It is a tautomer of a (S)-3-ammonio-3-phenylpropanoate.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)[C@H](CC(=O)O)N\\nThe natural language question is: The molecule is a monomethoxyflavone that is myricetin in which the hydroxy group at position 7 is substituted by a methoxy group. It has a role as a plant metabolite. 
It is a pentahydroxyflavone and a monomethoxyflavone. It derives from a myricetin.\\nThe corresponding SMILES representation is:\\nCOC1=CC(=C2C(=C1)OC(=C(C2=O)O)C3=CC(=C(C(=C3)O)O)O)O\\nThe natural language question is: The molecule is an N-acylserotonin obtained by formal condensation of the carboxy group of hexadecanoic acid with the primary amino group of serotonin. It derives from a hexadecanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)NCCC1=CNC2=C1C=C(C=C2)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of 1,4-benzoquinones that is 1,4-benzoquinone which has been substituted by a methoxy group at position 5 and a 3-(4-hydroxy-3-methoxyphenyl)propyl group at position 2. It has been isolated from the stems of Combretum griffithii and has been shown to exhibit anticancer activity. It has a role as a metabolite, an antineoplastic agent and a plant metabolite. It is a member of phenols, a monomethoxybenzene and a member of 1,4-benzoquinones.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=O)C(=CC1=O)CCCC2=CC(=C(C=C2)O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18291',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a disaccharide phosphate that is N-acetylbeta-D-glucosamine having a 6-O-phospho-beta-D-galactosyl residue attached at O-4. 
It is an amino disaccharide, a disaccharide phosphate and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)COP(=O)(O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6343',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of pyrazoles that is antipyrine substituted at C-4 by a methylamino group. It is a metabolite of aminopyrine and of metamizole. It has a role as a non-narcotic analgesic, an opioid analgesic, a non-steroidal anti-inflammatory drug, an EC 1.14.99.1 (prostaglandin-endoperoxide synthase) inhibitor, a peripheral nervous system drug, an antipyretic and a drug metabolite. It is a member of pyrazoles and a secondary amino compound. It derives from an antipyrine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(=O)N(N1C)C2=CC=CC=C2)NC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12468',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a carbohydrate acid anion that is the conjugate base of D-galactaro-1,5-lactone, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a D-galactaro-1,5-lactone.\\nThe corresponding SMILES representation is:\\n[C@@H]1([C@H]([C@H](OC(=O)[C@@H]1O)C(=O)[O-])O)O\\nThe natural language question is: The molecule is a member of the class of beta-carbolines that is 9H-beta-carboline substituted by a ethyl group at position 9, methoxy group at position 7 and a methyl group at position 1. It is semisynthetic derivative of harmine and has been shown to exhibit significant anti-HIV activity. 
It has a role as an anti-HIV agent. It is an aromatic ether, a semisynthetic derivative and a member of beta-carbolines. It derives from a harmine.\\nThe corresponding SMILES representation is:\\nCCN1C2=C(C=CC(=C2)OC)C3=C1C(=NC=C3)C\\nThe natural language question is: The molecule is an olefinic compound that octan-2-ol carrying a double bond at position 7. It has a role as a metabolite. It is a secondary alcohol and an olefinic compound.\\nThe corresponding SMILES representation is:\\nCC(CCCCC=C)O\\nThe natural language question is: The molecule is an oligopeptide comprising of nine amino acids with sequence L-Ile-L-Ala-L-Arg-L-Arg-L-His-L-Pro-L-Tyr-L-Phe-L-Leu. It was originally isolated from pepsin-treated human plasma and shares some sequence homology with the C-terminal end of neurotensin. It is a potent histamine releaser and may serve as an inflammatory mediator. It has a role as a human metabolite and a histamine releasing agent. It is a conjugate base of a kinetensin(2+).\\nThe corresponding SMILES representation is:\\nCC[C@H](C)[C@@H](C(=O)N[C@@H](C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CC1=CN=CN1)C(=O)N2CCC[C@H]2C(=O)N[C@@H](CC3=CC=C(C=C3)O)C(=O)N[C@@H](CC4=CC=CC=C4)C(=O)N[C@@H](CC(C)C)C(=O)O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of bipyridines that is 4,4'-bipyridine substituted at positions 2 and 2' by 4-fluoroaminophenyl and 4-methoxycyclohexanecarboxamido groups respectively. 
It is a member of bipyridines, a monocarboxylic acid amide, an ether, an organofluorine compound, an aromatic amine and a secondary amino compound.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1CCC(CC1)C(=O)NC2=NC=CC(=C2)C3=CC(=NC=C3)NC4=CC=C(C=C4)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8494',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a hydroxy-amino acid that is aspartic acid in which one of the methylene hydrogens has been replaced by a hydroxy group. It is a hydroxy-amino acid, an amino dicarboxylic acid, a C4-dicarboxylic acid and an aspartic acid derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(C(=O)O)O)(C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10044',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a peptide zwitterion obtained by transfer of two protons from the carboxy to the amino groups of Cys-Gly disulfide; major species at pH 7.3. It is a tautomer of a Cys-Gly disulfide.\\nThe corresponding SMILES representation is:\\nC(C(C(=O)NCC(=O)[O-])[NH3+])SSCC(C(=O)NCC(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is a member of the class of 2-benzofurans that is 2-benzofuran-1(3H)-one substituted by hydroxy groups at positions 5 and 6, a methyl group at position 7 and a propylidene group at position 3. It has been isolated from Penicillium purpurogenum. It has a role as a metabolite and a Penicillium metabolite. 
It is a gamma-lactone, a member of 2-benzofurans and a member of catechols.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\1/C2=CC(=C(C(=C2C(=O)O1)C)O)O\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (2E,12Z,15Z,18Z,21Z,24Z,27Z)-triacontaheptaenoic acid. It is an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. It is a conjugate acid of a (2E,12Z,15Z,18Z,21Z,24Z,27Z)-triacontaheptaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is an aromatic alcohol that is (but-3-en-1-yl)benzene in which one of the benzylic methylene hydrogens has been replaced by a hydroxy group. It is a secondary alcohol, an aromatic alcohol and an olefinic compound.\\nThe corresponding SMILES representation is:\\nC=CCC(C1=CC=CC=C1)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of chromones that is chromone substituted by hydroxy groups at positions 5 and 7 and a methoxy groups at position 6. It has been isolated from Pisonia aculeata. It has a role as a plant metabolite. It is an aromatic ether, a member of chromones and a member of resorcinols. It derives from a chromone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C2=C(C=C1O)OC=CC2=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22051',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a (2R,3S)-2-aminooctadec-4-ene-1,3-diol in which the double bond has E geochemistry. 
It is a conjugate base of a L-erythro-sphingosine(1+). It is an enantiomer of a sphingosine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCC/C=C/[C@@H]([C@@H](CO)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2227',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is hexaanion of 1-(5-phospho-D-ribosyl)-ATP. It has a role as a Saccharomyces cerevisiae metabolite. It is a conjugate base of a 1-(5-phospho-D-ribosyl)-ATP.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])O)O)N=CN(C2=N)C4[C@@H]([C@@H]([C@H](O4)COP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24030',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a diterpenyl phosphate that is the O-diphospho derivative of (+)-kolavenol It is a diterpenyl phosphate and a member of octahydronaphthalenes. It is a conjugate acid of a (+)-kolavenyl diphosphate(3-). It is an enantiomer of a (-)-kolavenyl diphosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1CC[C@]2([C@H]([C@]1(C)CC/C(=C/COP(=O)(O)OP(=O)(O)O)/C)CCC=C2C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1816',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a palmitate ester resulting from the formal condensation of palmitic acid with dodecan-1-ol. It is A wax ester synthesised by retinal pigment epithelial membranes. It has a role as a metabolite. It is a wax ester and a hexadecanoate ester. 
It derives from a dodecan-1-ol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OCCCCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28631',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a NTA and a tricarboxylic acid monoanion. It is a conjugate base of a nitrilotriacetic acid. It is a conjugate acid of a nitrilotriacetate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(=O)O)N(CC(=O)O)CC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24166',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an aflatoxin B1 compound formed via enzymic epoxidation of aflatoxin B1 followed by non-enzymic hydrolysis. It derives from an aflatoxin B1.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C2C3=C(C(=O)CC3)C(=O)OC2=C4[C@H]5[C@H](OC(C5O)O)OC4=C1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1775',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a diterpene alkaloid with formula C25H41NO9 that is isolated from several Aconitum species. It has a role as a plant metabolite, a human urinary metabolite, a NF-kappaB inhibitor and a xenobiotic. It is a bridged compound, a diterpene alkaloid, an organic heteropolycyclic compound, a polyether, a tertiary amino compound, a pentol, a secondary alcohol and a tertiary alcohol. 
It derives from a hydride of an aconitane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN1C[C@@]2([C@@H](C[C@@H]([C@@]34[C@@H]2[C@H]([C@@H](C31)[C@@]5([C@@H]6[C@H]4C[C@@]([C@@H]6O)([C@H]([C@@H]5O)OC)O)O)OC)OC)O)COC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21346',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pregnane-based steroidal hormone produced by the outer-section (zona glomerulosa) of the adrenal cortex in the adrenal gland, and acts on the distal tubules and collecting ducts of the kidney to cause the conservation of sodium, secretion of potassium, increased water retention, and increased blood pressure. The overall effect of aldosterone is to increase reabsorption of ions and water in the kidney. It has a role as a human metabolite and a mouse metabolite. It is an 11beta-hydroxy steroid, a 21-hydroxy steroid, a 18-oxo steroid, a 20-oxo steroid, a C21-steroid hormone, a steroid aldehyde, a 3-oxo-Delta(4) steroid, a primary alpha-hydroxy ketone and a mineralocorticoid. It derives from a hydride of a pregnane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2[C@H](C[C@]4([C@H]3CC[C@@H]4C(=O)CO)C=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7578',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a polyene antibiotic that is TMC-1A in which the 2,4-dimethyloct-2-enoyl group has been replaced by an (E)-4,6-dimethyldec-2-enoyl group. TMC-1D is an antitumour antibiotic isolated from Streptomyces sp. A-230. It has a role as an antineoplastic agent and a bacterial metabolite. 
It is a cyclic ketone, an enol, a polyene antibiotic, a secondary alcohol, a tertiary alcohol, an enone, an enamide and a secondary carboxamide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC(C)CC(C)/C=C/C(=O)NC1=C[C@]([C@@H](CC1=O)O)(/C=C/C=C/C=C/C(=O)NC2=C(CCC2=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26581',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a carbodiimide having cyclcohexyl and 2-(4-morpholinyl)ethyl as the two N-substituents. It has a role as a cross-linking reagent. It is a carbodiimide and a member of morpholines.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CCC(CC1)N=C=NCCN2CCOCC2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2920',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of 1-benzofurans, a member of guaiacols, a guaiacyl lignin and a primary alcohol. It is an enantiomer of a (2R,3S)-dihydrodehydrodiconiferyl alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=CC2=C1O[C@@H]([C@H]2CO)C3=CC(=C(C=C3)O)OC)CCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23337',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a monocarboxylic acid that is nonanoic acid in which one of the methyl hydrogens at position 9 has been replaced by a methylsulfinyl group. It has a role as a plant metabolite. It is a sulfoxide and a monocarboxylic acid. 
It derives from a nonanoic acid.\\nThe corresponding SMILES representation is:\\nCS(=O)CCCCCCCCC(=O)O\\nThe natural language question is: The molecule is an omega-hydroxy fatty acid ascaroside obtained by formal condensation of the alcoholic hydroxy group of 11-hydroxyundecanoic acid with ascarylopyranose (the alpha anomer). It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is a monocarboxylic acid and an omega-hydroxy fatty acid ascaroside. It derives from an 11-hydroxyundecanoic acid. It is a conjugate acid of an oscr#18(1-).\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCC(=O)O)O)O\\nThe natural language question is: The molecule is a dipeptide derivative that consists of arginylaspartamide having a 2-[(5-sulfo-1-naphthyl)amino]ethyl group attached to the amide nitrogen at the carboxy terminus and a [2-(2-{[4-(indol-3-yl)butanoyl]amino}ethoxy)ethoxy]acetyl group attached to the amino terminus. It is a dipeptide, a member of indoles and an arenesulfonic acid.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C(=CN2)CCCC(=O)NCCOCCOCC(=O)NC(CCCN=C(N)N)C(=O)NC(CC(=O)N)C(=O)NCCNC3=CC=CC4=C3C=CC=C4S(=O)(=O)O\\nThe natural language question is: The molecule is a phosphatidylcholine O-40:2 in which the alkyl and acyl groups at positions 1 and 2 are octadecyl and (13Z,16Z)-docosadienoyl respectively. It is a phosphatidylcholine O-40:2 and a 2-acyl-1-alkyl-sn-glycero-3-phosphocholine. It derives from a (13Z,16Z)-docosadienoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCOC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCCCCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the diphosphate OH groups of 7-methyl-7,8-dihydroguanosine-5'-diphosphate. 
It is a conjugate base of a 7-methyl-7,8-dihydroguanosine-5'-diphosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CN1CN(C2=C1C(=O)NC(=N2)N)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11852',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a germacranolide with anthelminthic, antiparasitic and antiviral activities. It has a role as an anthelminthic drug, an antiinfective agent, an antineoplastic agent, an antiparasitic agent, an antiviral drug and a metabolite. It is a germacranolide and a heterobicyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C/1=C\\\\\\\\CC/C(=C/[C@@H]2[C@@H](CC1)C(=C)C(=O)O2)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12853',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a lipid A derivative having an alpha-D-Glc-(1->3)-L-alpha-D-Hep-(1->3)-L-alpha-D-Hep-(1->5)-[alpha-Kdo-(2->4)]-alpha-Kdo moiety attached to the free primary hydroxy group of lipid A. It is a member of lipid As, a dodecanoate ester and a tetradecanoate ester. 
It is a conjugate acid of a glucosyl-(heptosyl)2-(KDO)2-lipid A(6-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCC(=O)O[C@H](CCCCCCCCCCC)CC(=O)O[C@@H]1[C@H]([C@@H](O[C@@H]([C@H]1OP(=O)(O)O)CO[C@@]2(C[C@H]([C@H]([C@H](O2)[C@@H](CO)O)O[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)[C@H](CO)O)O)O[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)[C@H](CO)O)O)O[C@@H]5[C@@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O)O)O)O[C@@]6(C[C@H]([C@H]([C@H](O6)[C@@H](CO)O)O)O)C(=O)O)C(=O)O)OC[C@@H]7[C@H]([C@@H]([C@H]([C@H](O7)OP(=O)(O)O)NC(=O)C[C@@H](CCCCCCCCCCC)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)O)NC(=O)C[C@@H](CCCCCCCCCCC)OC(=O)CCCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4402',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is coumarin carrying a methyl group at C-4 and a succinyl-leucyl-tryrosyl side-chain at C-7. It has a role as a peptidomimetic.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=O)OC2=C1C=CC(=C2)NC(=O)[C@H](CC3=CC=C(C=C3)O)NC(=O)[C@H](CC(C)C)NC(=O)CCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_369',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a sesquarterpene that is 1,4-cyclohexadiene bearing a methyl substituent at position 1 and a 6,10,14,18,22-pentamethyltricosa-5,9,13,17,21-pentaen-2-yl group at position 4. It is a cyclohexadiene and a sesquarterpene.\\nThe corresponding SMILES representation is:\\nCC1=CCC(=CC1)[C@H](C)CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C\\nThe natural language question is: The molecule is a 1,2-diacyl-sn-glycerol in which the acyl groups at positions 1 and 2 are specified as palmitoyl and (4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl respectively. 
It derives from an all-cis-docosa-4,7,10,13,16,19-hexaenoic acid and a hexadecanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCC(=O)OC[C@H](CO)OC(=O)CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CC\\nThe natural language question is: The molecule is a hydrochloride resulting from the reaction of equimolar amounts of bromhexine and hydrogen chloride. It is used as a mucolytic for the treatment of respiratory disorders associated with productive cough (i.e. a cough characterised by the production of sputum). It has a role as a mucolytic. It contains a bromhexine(1+).\\nThe corresponding SMILES representation is:\\nCN(CC1=C(C(=CC(=C1)Br)Br)N)C2CCCCC2.Cl\\nThe natural language question is: The molecule is a secondary carboxamide resulting from the formal condensation of the carboxy group of piperidine-4-carboxylic acid with the amino group of 5-{[(5-tert-butyl-1,3-oxazol-2-yl)methyl]sulfanyl}-1,3-thiazol-2-amine. It is an ATP-competitive inhibitor of CDK2, CDK7 and CDK9 kinases and exhibits anti-cancer properties. It has a role as an apoptosis inducer, an antineoplastic agent, an EC 2.7.11.22 (cyclin-dependent kinase) inhibitor and an angiogenesis inhibitor. 
It is a piperidinecarboxamide, a member of 1,3-oxazoles, a member of 1,3-thiazoles, an organic sulfide and a secondary carboxamide.\\nThe corresponding SMILES representation is:\\nCC(C)(C)C1=CN=C(O1)CSC2=CN=C(S2)NC(=O)C3CCNCC3\\nNext, you will be given a sample for test.The natural language question is: The molecule is an oligopeptide composed of L-alanine, L-arginine, glycine, L-tyrosine, L-serine, L-serine, L-phenylalanine, L-isoleucine, L-tyrosine, L-trytophan, L-phenylalanine, L-phenylalanine, L-aspartic acid and L-phenylalanine joined in sequence by peptide linkages.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC(C)[C@@H](C(=O)N[C@@H](CC1=CC=C(C=C1)O)C(=O)N[C@@H](CC2=CNC3=CC=CC=C32)C(=O)N[C@@H](CC4=CC=CC=C4)C(=O)N[C@@H](CC5=CC=CC=C5)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC6=CC=CC=C6)C(=O)O)NC(=O)[C@H](CC7=CC=CC=C7)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](CC8=CC=C(C=C8)O)NC(=O)CNC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](C)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1864',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an L-lysine derivative comprising L-lysine having two methyl substituents attached to the side-chain amino group. It is a L-lysine derivative and a non-proteinogenic L-alpha-amino acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN(C)CCCC[C@@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3509',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a member of the class of phenylureas that is 1-methylurea substituted by a p-cumenyl group at position 3. It is a metabolite of the herbicide isoproturon. 
It has a role as a marine xenobiotic metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C1=CC=C(C=C1)NC(=O)NC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15849',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of the class of phenylureas that is urea substituted by a 4-chlorophenyl group and a 3,4-dichlorophenyl group at positions 1 and 3 respectively. It has a role as a disinfectant, an antiseptic drug, an antimicrobial agent, an environmental contaminant and a xenobiotic. It is a dichlorobenzene, a member of monochlorobenzenes and a member of phenylureas. It derives from a 1,3-diphenylurea.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1NC(=O)NC2=CC(=C(C=C2)Cl)Cl)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9832',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a tricarboxylic acid that is pimelic acid carrying an additional carboxy substituent at position 3 as well as a hydroxy substituent at position 2 (the 2R,3S-diastereomer). It is a tricarboxylic acid and a secondary alcohol. It is a conjugate acid of a (-)-threo-isodihomocitrate(3-).\\nThe corresponding SMILES representation is:\\nC(C[C@@H]([C@H](C(=O)O)O)C(=O)O)CC(=O)O\\nThe natural language question is: The molecule is a carboxamide resulting from the formal condensation of the primary amino group of 5-cyclopropyl-1H-pyrazol-3-amine with the carboxy group of (2S)-2-phenylpropanoic acid in which the phenyl ring is substituted at the para position by a 2-oxopyrrolidin-1-yl group. A CDK2 inhibitor with antineoplastic activity. It has a role as an EC 2.7.11.22 (cyclin-dependent kinase) inhibitor and an antineoplastic agent. 
It is a member of pyrrolidin-2-ones, a member of pyrazoles, a member of cyclopropanes, a secondary carboxamide and a tertiary carboxamide.\\nThe corresponding SMILES representation is:\\nC[C@@H](C1=CC=C(C=C1)N2CCCC2=O)C(=O)NC3=NNC(=C3)C4CC4\\nThe natural language question is: The molecule is 1-(2,4-dichloro-10,11-dihydrodibenzo[a,d][7]annulen-5-yl)imidazole that is the (R)-enantiomer of eberconazole. It is a conjugate base of a (R)-eberconazole(1+). It is an enantiomer of a (S)-eberconazole.\\nThe corresponding SMILES representation is:\\nC1CC2=C([C@@H](C3=CC=CC=C31)N4C=CN=C4)C(=CC(=C2)Cl)Cl\\nThe natural language question is: The molecule is a member of the class of phenoxazines that is 1,9-dimethyl-3H-phenoxazin-3-one carrying an additional hydroxy substituent at position 7 as well as two 2,4-dihydroxy-6-methylphenyl substituents at positions 2 and 8. The isomer in which the hydroxy groups at positions 2' and 2'' on the phenyl rings are both on the same side of the plane of the phenoxazine ring system. A component of orcein, a mixture of dyes isolated from lichens. It has a role as a food colouring, a histological dye and a plant metabolite. It is a cyclic ketone, a phenoxazine, a polyphenol and a member of resorcinols.\\nThe corresponding SMILES representation is:\\nCC1=CC(=CC(=C1C2=C(C3=C(C=C2O)OC4=CC(=O)C(=C(C4=N3)C)C5=C(C=C(C=C5C)O)O)C)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a D-galactopyranuronic acid with a beta-configuration at the anomeric center. 
It is a conjugate acid of a beta-D-galacturonate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': '[C@@H]1([C@H]([C@H](O[C@H]([C@@H]1O)O)C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22230',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a hydroxy monocarboxylic acid anion that is the conjugate base of tetracenomycin F2, obtained by deprotonation of the carboxy group. It is a hydroxy monocarboxylic acid anion and an oxo monocarboxylic acid anion. It is a conjugate base of a tetracenomycin F2. It is a conjugate acid of a tetracenomycin F2(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)C1=C(C=C2CC3=C(C(=CC(=C3)O)O)C(=O)C2=C1O)/C=C(\\\\\\\\CC(=O)[O-])/O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4804',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a trihydroxyflavone that is flavone substituted by hydroxy groups at positions 5, 2', and 4', a methoxy group at position 7, a prenyl group at position 3 and a (1E)-3-methylbut-1-enyl group at position 6. Isolated from Artocarpus heterophyllus and Artocarpus integrifolia, it exhibits antineoplastic activity. It has a role as a metabolite and an antineoplastic agent. 
It is a monomethoxyflavone and a trihydroxyflavone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(C)/C=C/C1=C(C=C2C(=C1O)C(=O)C(=C(O2)C3=C(C=C(C=C3)O)O)CC=C(C)C)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1049',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an N-acylphosphatidylethanolamine in which the N-acyl group is specified as lauroyl (dodecanoyl) while the phosphatidyl acyl groups are both specified as oleoyl (9Z-octadecenoyl). It derives from an oleic acid and a dodecanoic acid. It is a conjugate acid of a N-lauroyl-1,2-dioleoyl-sn-glycero-3-phosphoethanolamine(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCC(=O)NCCOP(=O)(O)OC[C@@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2674',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a triazole that is benzonitrile substituted by a (1S,2R)-3-(4-fluorophenyl)-2-hydroxy-1-(1,2,4-triazol-1-yl)propyl group at position 4. It has a role as an epitope. It is a member of triazoles, an organofluorine compound and a secondary alcohol. It derives from a benzonitrile.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C[C@H]([C@H](C2=CC=C(C=C2)C#N)N3C=NC=N3)O)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15385',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is the L-enantiomer of gulonate. It has a role as a human metabolite and a mammalian metabolite. 
It is a conjugate base of a L-gulonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]([C@H]([C@@H]([C@@H](C(=O)[O-])O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1973',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a monocarboxylic acid that is acrylic acid with substituents 4-methoxyphenyl at position 2 and 6-methylpyridin-2-yl at position 3. It has a role as a metabolite. It is a monocarboxylic acid, a monomethoxybenzene and a member of pyridines. It derives from an acrylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=NC(=CC=C1)/C=C(/C2=CC=C(C=C2)OC)\\\\\\\\C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5137',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a lignan isolated from the stems of Sinocalamus affinis. It has a role as a plant metabolite. It is a lignan, a dimethoxybenzene, a polyphenol, an aromatic ketone and a primary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=CC(=CC(=C1O)OC)[C@@H]2[C@H]([C@@H](CO2)C(=O)C3=CC(=C(C(=C3)OC)O)OC)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15192',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a polyunsaturated fatty acid that is hexadecanoic acid with unsaturation at positions 4, 7, 10 and 13. It is found in Daphnia galeata. It has a role as a Daphnia galeata metabolite. 
It is a polyunsaturated fatty acid, a long-chain fatty acid and a straight-chain fatty acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8750',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the (R)-enantiomer of 2-hydroxy-4-methylpentanoic acid. Found in patients with short-bowel syndrome (an inborn error of metabolism), and in maple syrup urine disease, MSUD. It is a (2R)-2-hydroxy monocarboxylic acid and a 2-hydroxy-4-methylvaleric acid. It is a conjugate acid of a (R)-2-hydroxy-4-methylpentanoate. It is an enantiomer of a (S)-2-hydroxy-4-methylpentanoic acid.\\nThe corresponding SMILES representation is:\\nCC(C)C[C@H](C(=O)O)O\\nThe natural language question is: The molecule is a phosphonic acid having a 1-hydroxy-2-aminoethyl group attached to the phosphorus. It derives from a phosphonic acid. It is a tautomer of a (2-amino-1-hydroxyethyl)phosphonate zwitterion.\\nThe corresponding SMILES representation is:\\nC(C(O)P(=O)(O)O)N\\nThe natural language question is: The molecule is a 1-(phosphoribosyl)imidazole having the phospho group at the 5'-position and a carboxyamino group at the 5-position on the imidazole ring. It has a role as an Escherichia coli metabolite. It derives from a carbamic acid. It is a conjugate acid of a 5-carboxylatoamino-1-(5-O-phosphonato-D-ribosyl)imidazole(3-).\\nThe corresponding SMILES representation is:\\nC1=C(N(C=N1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O)O)O)NC(=O)O\\nThe natural language question is: The molecule is an ether that is 2-(1H-imidazol-1-yl)-1-(4-methoxyphenyl)ethanol in which the hydrogen of the hydroxy group has been substituted by a 3-(4-methoxyphenyl)propyl group. It has a role as a TRP channel blocker. 
It is a member of imidazoles, a monomethoxybenzene and an ether. It is a conjugate base of a SKF-96365 free base(1+).\\nThe corresponding SMILES representation is:\\nCOC1=CC=C(C=C1)CCCOC(CN2C=CN=C2)C3=CC=C(C=C3)OC\\nNext, you will be given a sample for test.The natural language question is: The molecule is a trisaccharide consisting of two beta-D-galactopyranose residues and a D-xylopyranose residue joined in sequence by (1->3) and (1->4) glycosidic bonds. It derives from a beta-(1->3)-galactobiose and a beta-D-Galp-(1->4)-D-Xylp.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@H]([C@@H]([C@H](C(O1)O)O)O)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11431',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a rutinoside consisting of cyanidin having the rutinosyl group at the 3-position. It is an anthocyanin cation, a disaccharide derivative, a member of benzenes and a rutinoside. It derives from a cyanidin cation. It is a conjugate acid of a cyanidin 3-O-rutinoside betaine.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=CC4=C(C=C(C=C4[O+]=C3C5=CC(=C(C=C5)O)O)O)O)O)O)O)O)O)O\\nThe natural language question is: The molecule is a quercetin O-glycoside that is quercetin substituted by a alpha-L-rhamnosyl moiety at position 3 via a glycosidic linkage. It has a role as an antioxidant, an antileishmanial agent, an EC 1.1.1.184 [carbonyl reductase (NADPH)] inhibitor, an EC 1.1.1.21 (aldehyde reductase) inhibitor, an EC 1.14.18.1 (tyrosinase) inhibitor and a plant metabolite. It is a monosaccharide derivative, a tetrahydroxyflavone, an alpha-L-rhamnoside and a quercetin O-glycoside. 
It is a conjugate acid of a quercitrin-7-olate.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC2=C(OC3=CC(=CC(=C3C2=O)O)O)C4=CC(=C(C=C4)O)O)O)O)O\\nThe natural language question is: The molecule is an oxazinane alkaloid that is methyl (4S)-1,3-oxazinane-4-carboxylate substituted by a (1H-pyrrol-2-yl group at position 2 and a 2-methylheptyl group at position 5. Isolated from Celastrus angulatus, it exhibits cytotoxic activity. It has a role as a metabolite and an antineoplastic agent. It is a member of pyrroles, an alkaloid, an oxazinane and a methyl ester.\\nThe corresponding SMILES representation is:\\nCCCCCC(C)C[C@H]1CO[C@@H](N[C@@H]1C(=O)OC)C2=CC=CN2\\nThe natural language question is: The molecule is a 1,2-benzoxazole compound having a sulfamoylmethyl substituent at the 3-position. It has a role as an anticonvulsant, an antioxidant, a central nervous system drug and a protective agent. It is a member of 1,2-benzoxazoles and a sulfonamide.\\nThe corresponding SMILES representation is:\\nC1=CC=C2C(=C1)C(=NO2)CS(=O)(=O)N\\nNext, you will be given a sample for test.The natural language question is: The molecule is a methoxyisoflavan that is (S)-isoflavan substituted by methoxy groups at positions 6 and 3' and hydroxy groups at positions 4 and 4'. It has been isolated from Taxus yunnanensis and exhibits inhibitory activity against CYP3A4. It has a role as a plant metabolite. It is a member of hydroxyisoflavans and a methoxyisoflavan. 
It derives from a (S)-isoflavan.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC2=C(C=C1)OC[C@@H]([C@H]2O)C3=CC(=C(C=C3)O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5233',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a diester that results from the condensation of the 1-carboxy groups of two molecules of propane-1,2,3-tricarboxylic acid with hydroxy groups at positions 14 and 15 of (2S,3S,5R,10R,12S,14S,15R,16R)-2-amino-12,16-dimethylicosane-3,5,10,14,15-pentol. It has a role as a metabolite and a carcinogenic agent. It is a fumonisin, a primary amino compound, a diester and a triol. It derives from a (2S,3S,5R,10R,12S,14S,15R,16R)-2-amino-12,16-dimethylicosane-3,5,10,14,15-pentol. It is a conjugate acid of a fumonisin B1(3-).\\nThe corresponding SMILES representation is:\\nCCCC[C@@H](C)[C@H]([C@H](C[C@@H](C)C[C@@H](CCCC[C@H](C[C@@H]([C@H](C)N)O)O)O)OC(=O)C[C@@H](CC(=O)O)C(=O)O)OC(=O)C[C@@H](CC(=O)O)C(=O)O\\nThe natural language question is: The molecule is tetraanion of 4-CDP-2-C-methyl-D-erythritol 2-phosphate arising from deprotonation of phosphate and diphosphate groups; major species at pH 7.3. 
It is a conjugate base of a 4-CDP-2-C-methyl-D-erythritol 2-phosphate.\\nThe corresponding SMILES representation is:\\nC[C@](CO)([C@@H](COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=CC(=NC2=O)N)O)O)O)OP(=O)([O-])[O-]\\nThe natural language question is: The molecule is an oligosaccharide derivative that is a tridecasaccharide derivative, the oligosaccharide portion of the Proteus penneri strain 25 lipopolysaccharide (LPS) core region.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O[C@@H]2[C@H](O[C@@H]([C@@H]([C@H]2O)N)O[C@@H]3[C@@H]([C@H]([C@H](O[C@@H]3C(=O)O)O[C@H]4[C@@H]([C@H](O[C@@H]([C@H]4O)O[C@@H]5[C@@H]([C@H](O[C@@H]([C@H]5O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)[C@H](CO)O)O[C@@H]7[C@@H](C[C@@](O[C@@H]7[C@@H](CO[C@@H]8[C@@H]([C@H]([C@H](CO8)N)O)O)O)(C(=O)O)O)O[C@@]9(C[C@H]([C@H]([C@H](O9)[C@@H](CO)O)O)O)C(=O)O)O)[C@H](CO[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@H](CO)O)O)O)O)OP(=O)(O)OCCN)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@@H](CO)O)O)O)O[C@@H]1[C@H]([C@H]([C@@H]([C@H](O1)[C@H](CO)O)O)O)O)O)COP(=O)(O)OCCN)CO)O)O[C@@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O\\nThe natural language question is: The molecule is a 6-alkylaminopurine that is adenine where one of the hydrogens of the amino group is replaced by a hydroxymethyl group. It has a role as a human metabolite. It is a 6-alkylaminopurine, a nucleobase analogue and a hemiaminal. It derives from an adenine.\\nThe corresponding SMILES representation is:\\nC1=NC2=C(N1)C(=NC=N2)NCO\\nNext, you will be given a sample for test.The natural language question is: The molecule is a dicarboxylic acid dianion resulting from the removal of a proton from both of the carboxy groups of heme d trans-diol. 
It is a conjugate base of a heme d trans-diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)[C@@]([C@]3(CCC(=O)[O-])O)(C)O)CCC(=O)[O-].[Fe]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_238',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an S-acyl-4-phosphopantetheine obtained by deprotonation of the phosphate OH groups of S-octanoyl-4'-phosphopantetheine; major species at pH 7.3. It is a conjugate base of a S-octanoyl-4'-phosphopantetheine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8569',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a monocarboxylic acid that is toluene in which one of the hydrogens of the methyl group has been replaced by a carboxy group. It has a role as a toxin, a human metabolite, an Escherichia coli metabolite, a plant metabolite, a Saccharomyces cerevisiae metabolite, an EC 6.4.1.1 (pyruvate carboxylase) inhibitor, an Aspergillus metabolite, a plant growth retardant, an allergen and an auxin. It is a monocarboxylic acid, a member of benzenes and a member of phenylacetic acids. It derives from an acetic acid. It is a conjugate acid of a phenylacetate.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)CC(=O)O\\nThe natural language question is: The molecule is a glycosylglucose consisting of two D-glucopyranose units connected by a beta-(1->6)-linkage. 
It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H](C(O2)O)O)O)O)O)O)O)O\\nThe natural language question is: The molecule is a fatty acid-taurine conjugate derived from tetracosanoic acid. It has a role as a mouse metabolite. It derives from a tetracosanoic acid. It is a conjugate acid of a N-tetracosanoyltaurine(1-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCC(=O)NCCS(=O)(=O)O\\nThe natural language question is: The molecule is dianion of oxidized Watasenia luciferin arising from deprotonation of both sulfate OH groups; major species at pH 7.3. It is a conjugate base of an oxidized Watasenia luciferin.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)CC2=NC(=CN=C2NC(=O)CC3=CC=C(C=C3)OS(=O)(=O)[O-])C4=CC=C(C=C4)OS(=O)(=O)[O-]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 17-oxo steroid that is andrastin D in which the keto group at position 3 has undergone formal reduction to give the corresponding 3alpha-hydroxy compound. It is a 3alpha-hydroxy steroid, a 15-hydroxy steroid, a 5beta steroid, a meroterpenoid, an enol, a methyl ester, a 17-oxo steroid and a 3beta-hydroxy-4,4-dimethylsteroid. It derives from an andrastin D. It is a conjugate acid of an andrastin E(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C[C@H]2[C@@]3(CC[C@H](C([C@H]3CC[C@@]2([C@]4([C@@]1(C(=C(C4=O)C)O)C)C(=O)OC)C)(C)C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5375',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of phenols that is phenol in which the hydrogen at position 2 is replaced by a polyprenyl group. 
It is a member of phenols, an olefinic compound and a polymer.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC1=CC=CC=C1O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15146',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organobromine compound that is fluoranthene in which the hydrogens at positions 7 and 10 are substituted by 4-bromophenyl groups, while those at positions 8 and 9 are substituted by 4-octylphenyl groups. It derives from a hydride of a fluoranthene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCC1=CC=C(C=C1)C2=C(C(=C3C4=CC=CC5=C4C(=CC=C5)C3=C2C6=CC=C(C=C6)Br)C7=CC=C(C=C7)Br)C8=CC=C(C=C8)CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10368',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a macrocyclic organosiloxane composed from seven units of dimethylsiloxane. It has a role as a marine xenobiotic metabolite. It is an organosiloxane and a macrocycle.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[Si]1(O[Si](O[Si](O[Si](O[Si](O[Si](O[Si](O1)(C)C)(C)C)(C)C)(C)C)(C)C)(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13712',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a hydroxyaurone that is aurone substituted by hydroxy groups at positions 6 and 4' respectively. It has a role as a plant metabolite. 
It derives from an aurone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC(=CC=C1/C=C\\\\\\\\2/C(=O)C3=C(O2)C=C(C=C3)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28274',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a diarylheptanoid that is (6E)-6-heptene substituted by an acetoxy group at position 3, a 3,4-dihydroxyphenyl group at position 1 and a 4-hydroxyphenyl group at position 7 (the 3R-stereoisomer). It has been isolated from the rhizomes of Curcuma kwangsiensis. It has a role as a plant metabolite. It is a diarylheptanoid, an acetate ester and a member of catechols.\\nThe corresponding SMILES representation is:\\nCC(=O)O[C@H](CC/C=C/C1=CC=C(C=C1)O)CCC2=CC(=C(C=C2)O)O\\nThe natural language question is: The molecule is an L-phenylalanine derivative that is the amide obtained by formal condensation of the carboxy group of L-phenylalanine with the amino group of 2-naphthylamine. It has a role as a chromogenic compound. It is a N-(2-naphthyl)carboxamide, an amino acid amide and a L-phenylalanine derivative.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)C[C@@H](C(=O)NC2=CC3=CC=CC=C3C=C2)N\\nThe natural language question is: The molecule is an organic nitrate salt obtained by reaction of equimolar amounts of arasertaconazole and nitric acid. The active R-enantiomer of sertaconazole nitrate that is used for treatment of vulvovaginal candidiasis. The racemate itself is also used as a broad-spectrum antifungal drug. It is an organic nitrate salt, a conazole antifungal drug and an imidazole antifungal drug. It contains an arasertaconazole(1+). 
It is an enantiomer of a (S)-sertaconazole nitrate.\\nThe corresponding SMILES representation is:\\nC1=CC2=C(C(=C1)Cl)SC=C2CO[C@@H](CN3C=CN=C3)C4=C(C=C(C=C4)Cl)Cl.[N+](=O)(O)[O-]\\nThe natural language question is: The molecule is a trisaccharide derivative in which alpha-D-galactosaminyl-(1->4)-alpha-D-galactosaminyl-(1->4)-alpha-D-galactosamine is linked glycosidically to biotin via a (21-oxo-3,6,9,12,15,18-hexaoxa-22-azapentacosan-1-yl)amino spacer. One of a set of synthesised biotinylated oligo-alpha-(1->4)-D-galactosamines comprising from two to six monosaccharide units, along with their N-acetylated derivatives (PMID:31913631), aimed at analysing the specificity of the antibody responses to a complex exopolysaccharide galactosaminogalactan found in Aspergillus fumigatus, the most important airborne human fungal pathogen in industrialized countries. It is a trisaccharide derivative and a member of biotins.\\nThe corresponding SMILES representation is:\\nC1[C@H]2[C@@H]([C@@H](S1)CCCCC(=O)NCCOCCOCCOCCOCCOCCOCCC(=O)NCCCO[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O[C@@H]4[C@@H]([C@H]([C@H]([C@H](O4)CO)O[C@@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O)N)O)N)O)N)NC(=O)N2\\nNext, you will be given a sample for test.The natural language question is: The molecule is a naphthochromene that is 4H-naphtho[2,3-h]chromene-4,7,12-trione substituted by a chloro group at position 10, hydroxy groups at positions 5, 9 and 11 and a methyl group at position 2. It is isolated from a fungal strain Phoma sp.BAUA2861 and acts as an inhibitor of the enzyme topoisomerase I. It has a role as a metabolite, an EC 5.99.1.2 (DNA topoisomerase) inhibitor, an antiviral agent, an antineoplastic agent and an antimicrobial agent. 
It is an organochlorine compound, a naphthochromene, a member of phenols and a member of p-quinones.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=O)C2=C(C=C3C(=C2O1)C(=O)C4=C(C(=C(C=C4C3=O)O)Cl)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22698',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a C27 bile acid that is 5beta-cholest-24-en-26-oic acid substituted by hydroxy groups at positions 3alpha and 7beta. It has a role as a metabolite. It is a cholestanoid, a 3alpha-hydroxy steroid, a dihydroxy monocarboxylic acid, a bile acid, a 7beta-hydroxy steroid and an alpha,beta-unsaturated monocarboxylic acid. It derives from a hydride of a 5beta-cholestane.\\nThe corresponding SMILES representation is:\\nC[C@H](CC/C=C(\\\\\\\\C)/C(=O)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)C\\nThe natural language question is: The molecule is an organophosphate insecticide and a dialkyl aryl phosphate. It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor and an agrochemical. It derives from a 4-(methylsulfanyl)phenol.\\nThe corresponding SMILES representation is:\\nCCCOP(=O)(OCCC)OC1=CC=C(C=C1)SC\\nThe natural language question is: The molecule is a tripeptide composed of L-alanine, glycine, and L-tyrosine joined in sequence by peptide linkages. It has a role as a metabolite. It derives from a L-alanine, a glycine and a L-tyrosine.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)NCC(=O)N[C@@H](CC1=CC=C(C=C1)O)C(=O)O)N\\nThe natural language question is: The molecule is a N-glycosyl compound, a ribose monophosphate, an aminopyrimidine and a hydroxypyrimidine. 
It is a conjugate acid of a 2,5-diamino-4-hydroxy-6-(5-phosphonatoribosylamino)pyrimidine(3-).\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@H]([C@@H](O1)NC2=C(C(=O)NC(=N2)N)N)O)O)OP(=O)(O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 3beta-sterol that is 5alpha-stigmasta-7,24(28)-dien-3beta-ol which is substituted at the 4alpha position by a hydroxymethyl group and in which the 24(28) double bond has Z configuration. It is a 3beta-sterol, a member of phytosterols and a Delta(7)-sterol. It derives from a (Z)-24-ethylidenelophenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C=C(/CC[C@@H](C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=CC[C@@H]4[C@@]3(CC[C@@H]([C@H]4CO)O)C)C)\\\\\\\\C(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5845',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a dipeptide obtained by formal condensation of the carboxy group of L-asparagine with the amino group of L-isoleucine. It is a dipeptide, a primary carboxamide, a secondary carboxamide, a primary amino compound and a carboxylic acid. It derives from a L-asparagine and a L-isoleucine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@@H](C(=O)O)NC(=O)[C@H](CC(=O)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10129',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a disaccharide that is beta-D-glucopyranose in which the hydroxy group at position 4 has been converted into the corresponding alpha-D-galactopyranoside. It is an alpha-D-galactoside and a glycosylglucose. 
It derives from a beta-D-glucose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O[C@@H]2[C@H](O[C@H]([C@@H]([C@H]2O)O)O)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19015',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a racemate composed of equimolar amounts of dextrobupivacaine hydrochloride hydrate and levobupivacaine hydrochloride hydrate. A piperidinecarboxamide-based local anaesthetic, it has a slow onset and long duration of action. It has a role as a local anaesthetic, an adrenergic antagonist, an amphiphile, an EC 3.1.1.8 (cholinesterase) inhibitor and an EC 3.6.3.8 (Ca(2+)-transporting ATPase) inhibitor. It contains a bupivacaine hydrochloride (anhydrous), a levobupivacaine hydrochloride hydrate and a dextrobupivacaine hydrochloride hydrate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCN1CCCCC1C(=O)NC2=C(C=CC=C2C)C.O.Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24394',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an arene epoxide that is tetraphene which has undergone 1,2 addition of an oxygen atom to the double bond at the 5-6 position. It has a role as a mutagen. It is an arene epoxide and an organic heteropentacyclic compound. It derives from a hydride of a tetraphene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C=C3C4=CC=CC=C4C5C(C3=CC2=C1)O5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16308',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a cresol with the methyl substituent at position 3. It is a minor urinary metabolite of toluene. 
It has a role as a human xenobiotic metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=CC=C1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23297',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a tetrapeptide composed of L-arginine, two L-phenylalanine units and L-cysteine joined in sequence by peptide linkages. It has a role as a metabolite. It derives from a L-arginine, a L-phenylalanine and a L-cysteine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C[C@@H](C(=O)N[C@@H](CC2=CC=CC=C2)C(=O)N[C@@H](CS)C(=O)O)NC(=O)[C@H](CCCN=C(N)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14757',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a branched amino octasaccharide consisting of a linear hexasaccharide chain of alpha-D-galactose, N-acetyl-beta-D-glucosamine, beta-D-galactose, beta-D-glucose, L-glycero-alpha-D-manno-heptose (Hep) and 3-deoxy-D-manno-oct-2-ulose (Kdo) residues linked in a (1->4), (1->3), (1->4), (1->4), (1->5) sequence, to the Hep residue of which is linked (1->3) an N-acetyl-alpha-D-glucosaminyl-(1->2)-L-glycero-alpha-D-manno-heptosyl side-chain. 
lpt3 mutant of the core oligosaccharide of Neisseria meningitidis.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1O[C@H]2[C@H]([C@@H]([C@H](O[C@@H]2O[C@@H]3[C@@H]([C@H](O[C@@H]([C@H]3O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O[C@H]5[C@@H]([C@H]([C@H]([C@H](O5)CO)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO)O)O)O)O)NC(=O)C)O)O)O)[C@H](CO)O)O[C@@H]8[C@@H](CC(O[C@@H]8[C@@H](CO)O)(C(=O)O)O)O)O)[C@H](CO)O)O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17164',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an omega-carboxyacyl-CoA(5-) obtained by deprotonation of the phosphate, diphosphate and carboxy groups of any 3-carboxy-3-hydroxypropanoyl-CoA; major species at pH 7.3. It is an omega-carboxyacyl-CoA(5-) and a monocarboxylic acid anion. It is a conjugate base of a 3-carboxy-3-hydroxypropanoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)CC(C(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12733',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a desulfoglucosinolic acid resulting from the formal condensation of the thiol group of (3S)-N,3-dihydroxypent-4-enethioamide with beta-D-glucopyranose. It has a role as an Arabidopsis thaliana metabolite. 
It is a desulfoglucosinolic acid and a secondary allylic alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C=C[C@H](CC(=NO)SC1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4579',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 9-HODE in which the 9-hydroxy group has S-stereochemistry. It is a conjugate acid of a 9(S)-HODE(1-). It is an enantiomer of a 9(R)-HODE.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C=C\\\\\\\\[C@H](CCCCCCCC(=O)O)O\\nThe natural language question is: The molecule is a member of the class of trichlorophenols that is isophthalonitrile substituted at positions 2, 4 and 5 by chloro groups and at position 6 by a hydroxy group. The major metabolite of chlorothalonil. It has a role as a bacterial xenobiotic metabolite. It is a member of trichlorophenols and a nitrile. It derives from an isophthalonitrile.\\nThe corresponding SMILES representation is:\\nC(#N)C1=C(C(=C(C(=C1Cl)C#N)Cl)Cl)O\\nThe natural language question is: The molecule is the conjugate base of N-formylmaleamic acid arising from deprotonation of the carboxy group. It derives from a maleamate. It is a conjugate base of a N-formylmaleamic acid.\\nThe corresponding SMILES representation is:\\nC(=C\\\\\\\\C(=O)[O-])\\\\\\\\C(=O)NC=O\\nThe natural language question is: The molecule is a branched-chain saturated fatty acid comprising tetradecanoic acid carrying a methyl substituent at positions 10 and 13. It is a marine metabolite isolated from the Caribbean Sponges Calyx podatypa and Agelas dispar. It has a role as an animal metabolite and a marine metabolite. 
It is a branched-chain saturated fatty acid, a long-chain fatty acid and a methyl-branched fatty acid.\\nThe corresponding SMILES representation is:\\nCC(C)CCC(C)CCCCCCCCC(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a beta-D-glucosyl-N-acylsphingosine in which the acyl group is specified as eicosanoyl. It has a role as a mouse metabolite. It derives from an icosanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)[C@@H](/C=C/CCCCCCCCCCCCC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12736',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an aryl thiol that is thiophenol substituted at position 2 by an amino group. It has a role as a plant metabolite. It is a substituted aniline and an aryl thiol.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C(=C1)N)S\\nThe natural language question is: The molecule is a steroidal acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of deoxycholic acid. It derives from a coenzyme A and a deoxycholic acid. It is a conjugate acid of a deoxycholoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nC[C@H](CCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)[C@H]4CC[C@@H]5[C@@]4([C@H](C[C@H]6[C@H]5CC[C@H]7[C@@]6(CC[C@H](C7)O)C)O)C\\nThe natural language question is: The molecule is a member of the class of rifamycins that is rifamycin SV lacking the O-methyl group at position 27. It has a role as a bacterial metabolite. It is an acetate ester, a cyclic ketal, a lactam, a macrocycle, a polyphenol and a member of rifamycins. 
It is a conjugate acid of a 27-O-demethylrifamycin SV(1-).\\nThe corresponding SMILES representation is:\\nC[C@H]1/C=C/C=C(\\\\\\\\C(=O)NC2=CC(=C3C(=C2O)C(=C(C4=C3C(=O)[C@](O4)(O/C=C/[C@@H]([C@H]([C@H]([C@@H]([C@@H]([C@@H]([C@H]1O)C)O)C)OC(=O)C)C)O)C)C)O)O)/C\\nThe natural language question is: The molecule is a guanosine 5'-phosphate that is the N(7)-methyl derivative of guanosine 5'-monophosphate. It is an ammonium betaine and a guanosine 5'-phosphate. It derives from a guanosine 5'-monophosphate. It is a conjugate base of a 7-methylguanosine 5'-phosphate(1+). It is a conjugate acid of a 7-methylguanosine 5'-phosphate(1-).\\nThe corresponding SMILES representation is:\\nCN1C=[N+](C2=C1C(=O)NC(=N2)N)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)[O-])O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is 1,4-Dihydro-2H-3,1-benzoxazin-2-one substituted at the 4 position by cyclopropylethynyl and trifluoromethyl groups (S configuration) and at the 6 position by chlorine. A non-nucleoside reverse transcriptase inhibitor with activity against HIV, it is used with other antiretrovirals for combination therapy of HIV infection. It has a role as a HIV-1 reverse transcriptase inhibitor and an antiviral drug. It is a benzoxazine, an acetylenic compound, an organochlorine compound, an organofluorine compound and a member of cyclopropanes.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1CC1C#C[C@]2(C3=C(C=CC(=C3)Cl)NC(=O)O2)C(F)(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7802',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an (omega-1)-hydroxy fatty acid ascaroside obtained by formal condensation of the alcoholic hydroxy group of (18R)-18-hydroxynonadecanoic acid with ascarylopyranose (the alpha anomer). It is a metabolite of the nematode Caenorhabditis elegans. 
It has a role as a Caenorhabditis elegans metabolite. It is a monocarboxylic acid and an (omega-1)-hydroxy fatty acid ascaroside. It derives from a (18R)-18-hydroxynonadecanoic acid. It is a conjugate acid of an ascr#34(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCCCCCCCCCCCCC(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1852',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a 1,2-diacyl-sn-glycerol 3-diphosphate(3-) arising from deprotonation of the diphosphate OH groups of 1,2-dioctanoyl-sn-glycerol 3-diphosphate. It is a conjugate base of a 1,2-dioctanoyl-sn-glycerol 3-diphosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OP(=O)([O-])[O-])OC(=O)CCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9328',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a dichlorobenzene, a sulfone, an aromatic ketone and a benzoylpyrazole. It has a role as an EC 1.13.11.27 (4-hydroxyphenylpyruvate dioxygenase) inhibitor.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C)N1C(=O)C(=C(N1)C2=C(C=C(C=C2)Cl)Cl)C(=O)C3=C(C=C(C=C3)S(=O)(=O)C)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20258',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a menaquinol whose structure comprises a 2-methylbenzohydroquinone nucleus and a side chain of twelve isoprenoid units. 
It has a role as an electron donor.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C2=CC=CC=C2C(=C1C/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28611',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organic disulfide that is didecyl disulfide in which the hydrogens of the terminal methyl groups are replaced by 3,4-dimethoxy-5-methyl-2H-pyran-2-ones groups respectively. It has been isolated from the marine sponge of the genus Plakortis. It has a role as an animal metabolite. It is a member of 2-pyranones, an ether, a polyketide and an organic disulfide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(OC(=O)C(=C1OC)OC)CCCCCCCCCCSSCCCCCCCCCCC2=C(C(=C(C(=O)O2)OC)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8582',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a dipeptide formed from two L-histidine residues. It has a role as a Mycoplasma genitalium metabolite. It derives from a L-histidine.\\nThe corresponding SMILES representation is:\\nC1=C(NC=N1)C[C@@H](C(=O)N[C@@H](CC2=CN=CN2)C(=O)O)N\\nThe natural language question is: The molecule is a glycinyl ester obtained by the formal condensation of the carboxy group of glycine with methanol. 
It has a role as a metabolite.\\nThe corresponding SMILES representation is:\\nCOC(=O)CN\\nThe natural language question is: The molecule is a 3-hydroxy fatty acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of (3R,19Z,22Z,25Z,28Z,31Z)-3-hydroxytetratriacontapentaenoyl-CoA; major species at pH 7.3. It is a (R)-3-hydroxyacyl-CoA(4-), a 3-hydroxy fatty acyl-CoA(4-) and an 11,12-saturated fatty acyl-CoA(4-). It is a conjugate base of a (3R,19Z,22Z,25Z,28Z,31Z)-3-hydroxytetratriacontapentaenoyl-CoA.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCC[C@H](CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)O\\nThe natural language question is: The molecule is a tetrachlorobenzene formed formally by chlorination of biphenyl-4,4'-diol at C-2, -3, -5 and -6. It derives from a biphenyl-4,4'-diol.\\nThe corresponding SMILES representation is:\\nC1=CC(=CC=C1C2=C(C(=C(C(=C2Cl)Cl)O)Cl)Cl)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an organosulfinic acid that is methanesulfinic acid in which one of the methyl hydrogens has been replaced by a formyl group. It is an organosulfinic acid and an alpha-CH2-containing aldehyde. It is a conjugate acid of a 2-sulfinoacetaldehyde(1-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C(C=O)S(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5432',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a hydroxyaldehyde that is pentanal carrying three hydroxy substituents at positions 3, 4 and 5. It is a deoxypentose, a hydroxyaldehyde and a triol. 
It derives from a hydride of a pentanal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C=O)C(C(CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6428',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an acetate ester obtained from the formal condensation of acetic acid with the hydroxy group at position 1 of 6-[(8Z)-pentadec-8-en-1-yl]benzene-1,2,4-triol. Isolated from the dried fruits of Ardisia colorata, it exhibits scavenging activity towards DPPH radicals and cytotoxicity against murine breast cancer cell line, FM3A. It has a role as a metabolite, an antineoplastic agent and a radical scavenger. It is a member of resorcinols and an acetate ester. It derives from a 6-[(8Z)-pentadec-8-en-1-yl]benzene-1,2,4-triol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C\\\\\\\\CCCCCCCC1=C(C(=CC(=C1)O)O)OC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8927',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is lipid IVA glycosylated with two 3-deoxy-D-manno-octulosonic acid (KDO) residues. It has a role as an Escherichia coli metabolite. 
It is a conjugate acid of a (KDO)2-lipid IVA(6-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCC[C@H](CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)(O)O)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO[C@@]3(C[C@H]([C@H]([C@H](O3)[C@@H](CO)O)O)O[C@@]4(C[C@H]([C@H]([C@H](O4)[C@@H](CO)O)O)O)C(=O)O)C(=O)O)OP(=O)(O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)NC(=O)C[C@@H](CCCCCCCCCCC)O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14397',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a C-glycosyl compound that is 1,8-dihydroxy-3-methylanthracen-9(10H)-one substituted by a 1-O-acetyl-3-O-senecioyl-alpha-L-lyxopyranosyl moiety at position 10 via a C-glycosidic linkage (the 10R stereoisomer). It is isolated from the leaves of Alvaradoa haitiensis and exhibits cytotoxicity against human oral epidermoid carcinoma. It has a role as a metabolite and an antineoplastic agent. It is a C-glycosyl compound, an acetate ester, a member of anthracenes and a polyphenol. It derives from a 3-methylbut-2-enoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC2=C(C(=C1)O)C(=O)C3=C([C@H]2[C@H]4[C@@H]([C@H]([C@H]([C@@H](O4)OC(=O)C)O)OC(=O)C=C(C)C)O)C=CC=C3O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22468',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a kaempferol O-glucuronide that is kaempferol with a beta-D-glucosiduronic acid residue attached at the 5-position. It has a role as a metabolite. 
It is a kaempferol O-glucuronide and a trihydroxyflavone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C2=C(C(=O)C3=C(O2)C=C(C=C3O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C(=O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9351',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a 3alpha-hydroxy steroid, a 7alpha-hydroxy steroid, a 12alpha-hydroxy steroid, a 24-hydroxy steroid and a 26-hydroxy steroid. It has a role as a bile acid metabolite. It derives from a hydride of a 5beta-cholestane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(C(C)CO)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9872',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an icosatetraenoic acid in which the four double bonds are located at positions 5, 8, 10 and 12 (the 5Z,8Z,10E,12E-isomer).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCC/C=C/C=C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26775',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxamic acid derived from phenylacetamide in which the benzene moiety is substituted at C-4 by a butoxy group. It has anti-inflammatory, analgesic, and antipyretic properties. It has a role as a non-narcotic analgesic, a non-steroidal anti-inflammatory drug and an antipyretic. 
It is a hydroxamic acid and an aromatic ether.\\nThe corresponding SMILES representation is:\\nCCCCOC1=CC=C(C=C1)CC(=O)NO\\nThe natural language question is: The molecule is an amino disaccharide consisting of 2-amino-2-deoxy-alpha-D-glucopyranose and 2-amino-2-deoxy-D-glucopyranose residues joined in sequence by a (1->6) glycosidic bond. It is an amino disaccharide and a primary amino compound. It derives from an alpha-D-glucosamine and a 2-amino-2-deoxy-D-glucopyranose.\\nThe corresponding SMILES representation is:\\nC([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H](C(O2)O)N)O)O)N)O)O)O\\nThe natural language question is: The molecule is a cyclic dicarboxylic anhydride that is the cyclic anhydride of hexahydrophthalic acid. It has a role as an allergen. It is a cyclic dicarboxylic anhydride and a tetrahydrofurandione.\\nThe corresponding SMILES representation is:\\nC1CCC2C(C1)C(=O)OC2=O\\nThe natural language question is: The molecule is an apo carotenoid triterpenoid that is tetracosane containing double bonds at the 2-3, 6-7, 8-9, 10-11, 12-13, 14-15, 16-17, 18-19, and 22-23 positions, and substituted by methyl groups at positions 2, 6, 10, 15, 19, and 23. It is an apo carotenoid triterpenoid, a triterpene and a polyene.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/C=C/C(=C/C=C/C=C(/C=C/C=C(/CCC=C(C)C)\\\\\\\\C)\\\\\\\\C)/C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a furan having a nitro group at position 5 and a carboxamido group in turn bearing a long-chain multifunctional N-alkyl group at position 2. 
It is a C-nitro compound, a member of furans, a member of indoles, a polyether and a monocarboxylic acid amide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=CN2)CCCC(=O)NCCOCCOCCOCCOCCOCCOCCOCCOCCC(=O)NC(CCCCNC(=O)C3=CC=C(O3)[N+](=O)[O-])C(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23750',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an alkene that is propane with a double bond at position 1. It has a role as a refrigerant and a xenobiotic. It is an alkene and a gas molecular entity.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC=C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23743',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a member of the class of benzochromenones that is urolithin A in which the phenolic hydrogen at position 8 has been replaced by a beta-D-glucuronosyl group. It has a role as a human urinary metabolite. It is a beta-D-glucosiduronic acid, a benzochromenone, a monosaccharide derivative and a member of phenols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1O)OC(=O)C3=C2C=CC(=C3)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)C(=O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26561',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a carboxyalkyl phosphate that is pyruvic acid substituted at position 3 by a 3-phosphonooxy group. It derives from a pyruvic acid. 
It is a conjugate acid of a 3-phosphonatooxypyruvate(3-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(C(=O)C(=O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7749',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an aminoacridine that is acridine in which the hydrogen at position 9 is replaced by an amino group. A fluorescent dyd and topical antiseptic agent, it is used (usually as the hydrochloride salt) in eye drops for the treatment of superficial eye infections. It has a role as an antiinfective agent, an antiseptic drug, a fluorescent dye, a MALDI matrix material, an acid-base indicator and a mutagen. It is a member of aminoacridines and a primary amino compound. It is a conjugate base of a 9-aminoacridine(1+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)C(=C3C=CC=CC3=N2)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25808',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an (S)-3-hydroxyacyl-CoA resulting from the formal condensation of the thiol group of coenzyme A with the 1-carboxy group of (3S)-hydroxyhexadecanedioic acid. It derives from a hexadecanedioic acid. 
It is a conjugate acid of a (3S)-hydroxyhexadecanedioyl-CoA(5-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)C[C@H](CCCCCCCCCCCCC(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15614',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a cembrane diterpenoid that is cembra-2E,7E,11Z-trien-20,10-olide substituted by hydroxy groups at positions 1 and 4. It has been isolated from the leaves of Croton gratissimus. It has a role as a metabolite. It is a cembrane diterpenoid, a diterpene lactone, a macrocycle and a tertiary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C/1=C\\\\\\\\CC[C@@](/C=C/[C@](CCC2=C[C@@H](C1)OC2=O)(C(C)C)O)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4658',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an N-hydroxy-L-polyhomomethionine in which there are eight methylene groups between the alpha-carbon and sulfur atoms. It is a N-hydroxy-L-polyhomomethionine and a N-hydroxyhexahomomethionine. It is a conjugate acid of a N-hydroxy-L-hexahomomethioninate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CSCCCCCCCC[C@@H](C(=O)O)NO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24084',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a sulfonamide that is benzenesulfonamide substituted by an acetylamino group at position 4 and a 4,6-dimethoxy-pyrimidin-2-yl group at the nitrogen atom. It is a metabolite of the sulfonamide antibiotic sulfadimethoxine. 
It has a role as a marine xenobiotic metabolite. It is a sulfonamide, a member of acetamides and a member of pyrimidines.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)NC1=CC=C(C=C1)S(=O)(=O)NC2=NC(=CC(=N2)OC)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27489',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a proanthocyanidin consisting of two molecules of (-)-epicatechin joined by a bond between positions 4 and 6' in beta-configuration. It can be found in grape seeds, in Hibiscus cannabinus (kenaf) root and bark, in apple and in cacao. It has a role as a metabolite. It is a hydroxyflavan, a proanthocyanidin, a biflavonoid and a polyphenol. It derives from a (-)-epicatechin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@H]([C@H](OC2=C1C(=C(C(=C2)O)[C@@H]3[C@H]([C@H](OC4=CC(=CC(=C34)O)O)C5=CC(=C(C=C5)O)O)O)O)C6=CC(=C(C=C6)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16642',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion arising from deprotonation of both free diphosphate OH groups of beta-D-Glc-(1->3)-alpha-D-GlcNAc-diphospho-ditrans,octacis-undecaprenol It is a conjugate base of a beta-D-Glc-(1->3)-alpha-D-GlcNAc-diphospho-ditrans,octacis-undecaprenol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])OP(=O)([O-])O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16271',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) 
representation from the natural language description of the molecular.The natural language question is: The molecule is a dihydroxy- and monomethoxy-flavone in which the hydroxy groups are positioned at C-5 and C-7 and the methoxy group is at C-6. It has a role as an antineoplastic agent and an EC 1.14.13.39 (nitric oxide synthase) inhibitor. It is a monomethoxyflavone and a dihydroxyflavone. It is a conjugate acid of an oroxylin A(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC=CC=C3)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16189',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a 2-acyl-sn-glycero-3-phosphoethanolamine zwitterion obtained by transfer of a proton from the phosphate to the amino group of 2-arachidonoyl-sn-glycero-3-phosphoethanolamine; major species at pH 7.3. It derives from an arachidonic acid. It is a tautomer of a 2-arachidonyl-sn-glycero-3-phosphoethanolamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O[C@H](CO)COP(=O)([O-])OCC[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9658',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an O-acyl-D-carnitine in which the acyl group specified as octanoyl. It is an O-octanoylcarnitine and an O-acyl-D-carnitine. 
It is an enantiomer of an O-octanoyl-L-carnitine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCC(=O)O[C@@H](CC(=O)[O-])C[N+](C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14932',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a trisaccharide that is beta-D-galactopyranose in which the hydroxy groups at positions 2 and 3 have been converted into the corresponding beta-D-glucopyranosyl and beta-D-mannopyranosyl derivatives, respectively. It derives from a beta-D-Glcp-(1->2)-beta-D-Galp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4617',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a monocarboxylic acid anion resulting from the deprotonation of the carboxy group of (9Z)-18-hydroxyoctadec-9-enoic acid 18-O-beta-D-glucoside. It is a conjugate base of a (9Z)-18-hydroxyoctadec-9-enoic acid 18-O-beta-D-glucoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCO[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O)CCC/C=C\\\\\\\\CCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22852',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a stilbenoid that is the (-)-trans-stereoisomer of epsilon-viniferin, obtained by cyclodimerisation of trans-resveratrol. It has a role as a metabolite. It is a member of 1-benzofurans, a polyphenol and a stilbenoid. It derives from a trans-resveratrol. 
It is an enantiomer of a (+)-trans-epsilon-viniferin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C/C2=C3[C@H]([C@@H](OC3=CC(=C2)O)C4=CC=C(C=C4)O)C5=CC(=CC(=C5)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11478',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an N-(long-chain-acyl)ethanolamine that is the ethanolamide of octadecanoic acid. It is a N-(long-chain-acyl)ethanolamine, a N-(saturated fatty acyl)ethanolamine and a N-acylethanolamine 18:0. It derives from an octadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCC(=O)NCCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11361',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an icosanoid that is (6E,8Z,10E,14Z)-icosatetraenoic acid substituted at positions 5 and 12 by oxo and hydroxy groups respectively. It has a role as a human blood serum metabolite. It is an enone, an icosanoid, a long-chain fatty acid, an oxo fatty acid and a hydroxy polyunsaturated fatty acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C[C@@H](/C=C/C=C\\\\\\\\C=C\\\\\\\\C(=O)CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5500',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of the class of indoles that is 1H-indole carrying methoxy and isothiocyanatomethyl substituents at positions 1 and 3 respectively. It has a role as an Arabidopsis thaliana metabolite. 
It is an isothiocyanate and a member of indoles.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CON1C=C(C2=CC=CC=C21)CN=C=S'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16965',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a C-glycosyl compound that is isovitexin with the hydroxy group at position 7 replaced with a glucopyranosyl entity which in turn is substituted at position 6 by an isoferuloyl moiety. It has a role as a metabolite. It is a dihydroxyflavone, a cinnamate ester and a C-glycosyl compound. It derives from an isovitexin and an isoferulic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=C(C=C1)/C=C/C(=O)OCC2[C@H]([C@@H]([C@H](C(O2)OC3=C(C(=C4C(=C3)OC(=CC4=O)C5=CC=C(C=C5)O)O)[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1115',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an amino trisaccharide comprised of alpha-neuraminic acid, N-acetyl-beta-D-galactosamine and N-acetyl-beta-D-glucosamine residues linked sequentially (2->3) and (1->4). 
It is an amino trisaccharide, a galactosamine oligosaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H](C[C@@](O[C@H]1[C@@H]([C@@H](CO)O)O)(C(=O)O)O[C@@H]2[C@H]([C@@H](O[C@@H]([C@@H]2O)CO)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O)NC(=O)C)O)CO)NC(=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14340',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a 3-oxo-fatty acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of (8Z,11Z,14Z,17Z)-3-oxoicosa-8,11,14,17-tetraenoyl-CoA. It is a conjugate base of an (8Z,11Z,14Z,17Z)-3-oxoicosatetraenoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28108',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a phosphatidylethanolamine 32:0 zwitterion obtained by transfer of a proton from the phosphate to the amino group of 1,2-dihexadecanoyl-sn-glycero-3-phosphoethanolamine. 
It is a tautomer of a 1,2-dihexadecanoyl-sn-glycero-3-phosphoethanolamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18958',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of benzofurans that is 2,3-dihydro-1-benzofuran substituted by a carboxy group at position 5, a prenyl group at position 7 and a 6-methylhepta-2,5-dien-2-yl group at position 2. Isolated from Myrsine seguinii, it exhibits anti-inflammatory activity. It has a role as a metabolite, an anti-inflammatory agent and an EC 4.4.1.11 (methionine gamma-lyase) inhibitor. It is a member of 1-benzofurans and a monocarboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CC/C=C(\\\\\\\\C)/C1CC2=C(O1)C(=CC(=C2)C(=O)O)CC=C(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18359',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a pentacyclic triterpenoid that is olean-12-ene substituted by carboxy groups at positions 23 and 28 and a hydroxy group at position 3 (the 3beta stereoisomer). It has a role as a metabolite and an antibacterial agent. It is a pentacyclic triterpenoid and a hydroxy carboxylic acid. It is a conjugate acid of a gypsogenate(2-). 
It derives from a hydride of an oleanane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@@H]([C@@]([C@@H]1CC[C@@]3([C@@H]2CC=C4[C@]3(CC[C@@]5([C@H]4CC(CC5)(C)C)C(=O)O)C)C)(C)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19306',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an alkyl sulfate that is the mono(2-ethylhexyl) ester of sulfuric acid. It has a role as a surfactant and a carcinogenic agent. It derives from a 2-ethylhexan-1-ol. It is a conjugate acid of a 2-ethylhexyl sulfate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC(CC)COS(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20999',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 25-hydroxy steroid, a 12alpha-hydroxy steroid, a 3alpha-hydroxy steroid and a 7alpha-hydroxy steroid. It derives from a hydride of a 5alpha-cholane.\\nThe corresponding SMILES representation is:\\nC[C@H](CCCC(C)(C)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](C[C@@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C\\nThe natural language question is: The molecule is a 1,2-diacyl-sn-glycerol in which both acyl groups are specified as linoleoyl. It has a role as a mouse metabolite. It is a 1,2-diacyl-sn-glycerol and a dilinoleoylglycerol. It derives from a linoleic acid. It is an enantiomer of a 2,3-dilinoleoyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](CO)OC(=O)CCCCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC\\nThe natural language question is: The molecule is a member of the class of chalcones that is trans-chalcone substituted by methoxy groups at positions 2', 4 and 4' and hydroxy groups at positions 3' and 6'. 
It is a member of chalcones, a dimethoxybenzene and a member of hydroquinones. It derives from a trans-chalcone.\\nThe corresponding SMILES representation is:\\nCOC1=CC=C(C=C1)/C=C/C(=O)C2=C(C(=C(C=C2O)OC)O)OC\\nThe natural language question is: The molecule is a carboxylic ester obtained by the formal condensation of 2-phenylethanol with isobutyric acid. It has a role as a metabolite. It derives from a 2-phenylethanol and an isobutyric acid.\\nThe corresponding SMILES representation is:\\nCC(C)C(=O)OCCC1=CC=CC=C1\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of chromenes that is 2H-1-benzopyran substituted by methyl groups at positions 2 and 2, an ethyl group at position 4, a hydroxy group at position 7 and a 4-methoxyphenyl group at position 3 respectively. It is a member of chromenes, a monomethoxybenzene and a member of phenols.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCC1=C(C(OC2=C1C=CC(=C2)O)(C)C)C3=CC=C(C=C3)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12985',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a dicarboxylic acid monoanion. It is a conjugate base of a methotrexate. It is a conjugate acid of a methotrexate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6502',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a monocarboxylic acid anion that is the conjugate base of oscr#34, obtained by deprotonation of the carboxy group; major species at pH 7.3. 
It is a conjugate base of an oscr#34.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCCCCCCCCCC(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15685',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a monohydroxybenzoic acid that is 4-hydroxybenzoic acid carrying an additional amino substitutent at position 3. It is an aminobenzoic acid and a monohydroxybenzoic acid. It is a conjugate acid of a 3-amino-4-hydroxybenzoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C=C1C(=O)O)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15185',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is the (-)-(7S,8S)-stereoisomer of guaiacylglycerol. It has been isolated from the stems of Sinocalamus affinis. It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)[C@@H]([C@H](CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22333',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organofluorine compound that consists of 1,3-oxazolidin-2-one bearing an N-3-fluoro-4-(morpholin-4-yl)phenyl group as well as an acetamidomethyl group at position 5. A synthetic antibacterial agent that inhibits bacterial protein synthesis by binding to a site on 23S ribosomal RNA of the 50S subunit and prevents further formation of a functional 70S initiation complex. It has a role as an antibacterial drug and a protein synthesis inhibitor. 
It is an oxazolidinone, a member of morpholines, an organofluorine compound and a member of acetamides.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)NC[C@H]1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10086',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a trehalose monomycolate that is alpha,alpha-trehalose 6-phosphate carrying an additional mycolyl substituent at position 6'. It is a trehalose monomycolate, a monoacyl alpha,alpha-trehalose and a trehalose phosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCC([C@@H](CCCCCCCCCCCCCCCC1CC1CCCCCCCCCCC2CC2CCCCCCCCCCCCCCCCCC)O)C(=O)OC[C@@H]3[C@H]([C@@H]([C@H]([C@H](O3)O[C@@H]4[C@@H]([C@H]([C@@H]([C@H](O4)COP(=O)(O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6090',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a carbamate ester that is the isopropyl ester of 3-chlorophenylcarbamic acid. It has a role as a herbicide and a plant growth retardant. It is a carbamate ester, a member of benzenes and a member of monochlorobenzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)OC(=O)NC1=CC(=CC=C1)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8100',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a trisaccharide that is alpha-D-glucopyranosyl-(1->3)-D-galactopyranose in which the hydroxy group at position 4 of the D-galactopyranose moiety has been glycosylated by an alpha-L-fucopyranosyl group. 
It derives from an alpha-D-Glcp-(1->3)-D-Galp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)O[C@H]2[C@H](OC([C@@H]([C@H]2O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)O)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15661',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a phosphatidylinositol mannoside having the phosphatidyl moiety (with specific O-acyl groups) at the 1-position, a single mannosyl residue at the 2-position and a linear pentamannoside at position 6 of the inositol ring. It has a role as an antigen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)(O)O[C@@H]1[C@@H]([C@@H]([C@H]([C@@H]([C@H]1O[C@@H]2[C@H]([C@H]([C@@H]([C@H](O2)CO[C@@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O)O)O)O)O)O)O)O)O)O)O[C@@H]7[C@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O)OC(=O)CCCCCCCC(C)CCCCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20516',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of pyridopyrimidines that is an mTOR inhibitor and shows anti-tumour properties. It has a role as a mTOR inhibitor and an antineoplastic agent. 
It is a member of morpholines, a pyridopyrimidine, a monomethoxybenzene, a tertiary amino compound and a member of benzyl alcohols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1CN(C[C@@H](O1)C)C2=NC3=C(C=CC(=N3)C4=CC(=C(C=C4)OC)CO)C(=N2)N5CCOCC5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21825',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an organosulfur heterocyclic compound produced by a marine bacterium Alteromonas rava and has been shown to exhibit antibacterial activity against Gram-positive and Gram-negative bacteria. It has a role as an antibacterial agent, an antimicrobial agent and a bacterial metabolite. It is an enoate ester, a lactam and an organosulfur heterocyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](/C=C/C[C@H]1CO[C@H]([C@@H]([C@@H]1O)O)C/C(=C/C(=O)OCCCCCCCC(=O)NC2=C3C(=CSS3)NC2=O)/C)[C@H](C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4196',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a disaccharide derivative consisting of D-ribitol-5-phosphate having an alpha-L-rhamnosyl residue attached at the 4-position. It is an alditol 5-phosphate and a disaccharide derivative. It derives from a ribitol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@H](COP(=O)(O)O)[C@H]([C@H](CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4306',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a retinoid that consists of all-trans-retinol having a hydroxy substituent at position 4 on the cyclohexenyl ring. 
It has a role as a human xenobiotic metabolite and a mouse metabolite. It is a retinoid and a diol. It derives from an all-trans-retinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C(CCC1O)(C)C)/C=C/C(=C/C=C/C(=C/CO)/C)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20558',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organosilicon compound that is docosylsilane in which the hydrogens attached to the Si atom have been replaced by chloro groups. Metabolite observed in cancer metabolism. It has a role as a human metabolite. It is an organosilicon compound and a chlorine molecular entity.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCC[Si](Cl)(Cl)Cl\\nThe natural language question is: The molecule is an omega-hydroxy fatty acid comprising heptanoic acid which is substituted by a hydroxy group at position 7. It is an omega-hydroxy fatty acid, a medium-chain fatty acid and a straight-chain fatty acid. It derives from a heptanoic acid.\\nThe corresponding SMILES representation is:\\nC(CCCO)CCC(=O)O\\nThe natural language question is: The molecule is a quinolinium ion obtained by protonation of the quinoline nitrogen of lenvatinib. It is a conjugate acid of a lenvatinib.\\nThe corresponding SMILES representation is:\\nCOC1=CC2=[NH+]C=CC(=C2C=C1C(=O)N)OC3=CC(=C(C=C3)NC(=O)NC4CC4)Cl\\nThe natural language question is: The molecule is a 2-oxo monocarboxylic acid anion that is the conjugate base of 4-methyl-2-oxopentanoic acid. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It derives from a valerate. 
It is a conjugate base of a 4-methyl-2-oxopentanoic acid.\\nThe corresponding SMILES representation is:\\nCC(C)CC(=O)C(=O)[O-]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a butyrate ester obtained by the formal condensation of butyric acid with butan-2-ol. It has a role as a metabolite. It derives from a butan-2-ol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCC(=O)OC(C)CC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15130',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a 1-phosphatidyl-1D-myo-inositol 3-phosphate in which both phosphatidyl acyl groups are specified as octanoyl. It is a 1-phosphatidyl-1D-myo-inositol 3-phosphate and an octanoate ester. It is a conjugate acid of a 1,2-dioctanoyl-sn-glycero-3-phospho-(1'-D-myo-inositol-3'-phosphate)(3-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCCCCCCC(=O)OC[C@H](COP(=O)(O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H]([C@H]1O)OP(=O)(O)O)O)O)O)OC(=O)CCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28329',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a member of the class of quinolines that is quinoline substituted by a methyl group, 3-fluoro-2-(2-hydroxypropan-2-yl)phenoxy group, fluoro group and fluoro group at positions 2, 3, 7 and 8, respectively. It is a fungicide being developed by Nippon-Soda Co. Ltd (Japan) which has stable efficacy against a wide range of plant diseases, such as gray mold, scab and rice blast. It has a role as a fungicide. 
It is a member of quinolines, an organofluorine compound, an aromatic ether and a member of benzyl alcohols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C=C2C=CC(=C(C2=N1)F)F)OC3=C(C(=CC=C3)F)C(C)(C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17115',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a limonoid isolated from Azadirachta indica. It has a role as an antineoplastic agent and a plant metabolite. It is an organic heteropentacyclic compound, a cyclic terpene ketone, an enone, a member of furans, a limonoid and a methyl ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C2[C@@H](C[C@H]1C3=COC=C3)O[C@H]4[C@@]2([C@@H]([C@@]5([C@H]6[C@H]4OC[C@@]6(C=CC5=O)C)C)CC(=O)OC)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11216',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside derived from (12R)-12-hydroxytridecanoic acid. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside, a monocarboxylic acid and an (omega-1)-hydroxy fatty acid ascaroside. It derives from an ascr#22 and a (12R)-12-hydroxytridecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCCCCCCC(=O)O)O)OC(=O)C2=CNC3=CC=CC=C32'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12219',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is the ammonium ion resulting from the protonation of the amino group of scopolamine. 
It is a conjugate acid of a scopolamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[NH+]1[C@@H]2CC(C[C@H]1[C@H]3[C@@H]2O3)OC(=O)[C@H](CO)C4=CC=CC=C4'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_363',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an optically active form of O-isobutyrylcarnitine having (R)-configuration. It has a role as a human metabolite. It is an O-isobutyrylcarnitine, a methyl-branched fatty acyl-L-carnitine and a saturated fatty acyl-L-carnitine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C(=O)O[C@H](CC(=O)[O-])C[N+](C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17377',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a precorrin carboxylic acid anion obtained by deprotonation of the carboxy groups of cobalt-precorrin-4; major species at pH 7.3. It is a conjugate base of a cobalt-precorrin-4.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1C23C4=C([C@@](C(=N4)CC5=C(C(=C([N-]5)/C=C\\\\\\\\6/[C@H]([C@](C(=N6)/C=C(\\\\\\\\[N-]2)/[C@H]([C@@]3(CC(=O)O1)C)CCC(=O)[O-])(C)CC(=O)[O-])CCC(=O)[O-])CC(=O)[O-])CCC(=O)[O-])(C)CCC(=O)[O-])CC(=O)[O-].[Co]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2111',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an alpha,omega-dicarboxylic acid that is succinic acid substituted by two methyl groups at positions 2 and 2 respectively. 
It derives from a succinic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(CC(=O)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27234',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pyridopyrimidine that is 9-methyl-2,4-dioxo-2H-pyrido[1,2-a]pyrimidine substituted at positions 1 and 3 by (2-chloro-1,3-thiazol-5-yl)methyl and 3,5-dichlorophenyl. A mesionic insecticide used for control of rice hoppers. It has a role as an agrochemical. It is an iminium betaine, an organochlorine insecticide, a dichlorobenzene, a member of 1,3-thiazoles and a pyridopyrimidine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC=CN2C1=[N+](C(=C(C2=O)C3=CC(=CC(=C3)Cl)Cl)[O-])CC4=CN=C(S4)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20358',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is the anion obtained from global deprotonation of the phosphate OH groups of myo-inositol 1,3,4,5,6-pentakisphosphate; major species at pH 7.3. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of a myo-inositol 1,3,4,5,6-pentakisphosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@H]1([C@H](C([C@H]([C@@H](C1O)OP(=O)([O-])[O-])OP(=O)([O-])[O-])OP(=O)([O-])[O-])OP(=O)([O-])[O-])OP(=O)([O-])[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26716',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (10E,12Z)-hexadecadienoic acid. 
It is a long-chain fatty acyl-CoA and an unsaturated fatty acyl-CoA. It is a conjugate acid of a (10E,12Z)-hexadecadienoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCC/C=C\\\\\\\\C=C/CCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12304',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a triterpenoid saponin with hederagenin as the sapogenin. It has been isolated from the stem bark of Kalopanax pictus. It has a role as an anti-inflammatory agent and a plant metabolite. It is a triterpenoid saponin, a pentacyclic triterpenoid, an acetate ester and a primary alcohol. It derives from a hederagenin. It derives from a hydride of an oleanane.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H](O[C@H]([C@@H]([C@H]2O)O)OC[C@@H]3[C@H]([C@@H]([C@H]([C@@H](O3)OC(=O)[C@@]45CC[C@@]6(C(=CC[C@H]7[C@]6(CC[C@@H]8[C@@]7(CC[C@@H]([C@@]8(C)CO)O[C@H]9[C@@H]([C@H]([C@H](CO9)O)OC(=O)C)OC(=O)C)C)C)[C@@H]4CC(CC5)(C)C)C)O)O)O)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3742',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an N,N-dihydroxy-alpha-amino acid having a 8-thianonyl substituent at the 2-position. It derives from a pentahomomethionine. 
It is a conjugate acid of a N,N-dihydroxypentahomomethioninate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CSCCCCCCCC(C(=O)O)N(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22474',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a disaccharide derivative consisting of two 3-deoxy-alpha-D-manno-oct-2-ulopyranonosyl units joined via an alpha-(2->8)-linkage with an O-allyl group at the anomeric centre and with the carboxy group of the residue at the non-reducing end methyl-esterified. It is a disaccharide derivative, a glycoside, a methyl ester and a dicarboxylic acid monoester. It is a conjugate acid of an alpha-Kdo1Me-(2->8)-alpha-Kdo-OAll(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC(=O)[C@]1(C[C@H]([C@H]([C@H](O1)[C@@H](CO)O)O)O)OC[C@H]([C@@H]2[C@@H]([C@@H](C[C@@](O2)(C(=O)O)OCC=C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27879',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a tertiary amino compound that is propylamine which is substituted at position 3 by a pyridin-2-yl group and a p-chlorophenyl group and in which the hydrogens attached to the nitrogen are replaced by methyl groups. A histamine H1 antagonist, it is used to relieve the symptoms of hay fever, rhinitis, urticaria, and asthma. It has a role as a H1-receptor antagonist, an antipruritic drug, a histamine antagonist, a serotonin uptake inhibitor, an antidepressant and an anti-allergic agent. 
It is a tertiary amino compound, a member of monochlorobenzenes and a member of pyridines.\\nThe corresponding SMILES representation is:\\nCN(C)CCC(C1=CC=C(C=C1)Cl)C2=CC=CC=N2\\nThe natural language question is: The molecule is a sulfonamide in which the nitrogen of 4-sulfamoylbenzoic acid is substituted with two propyl groups. It has a role as a uricosuric drug. It is a sulfonamide and a member of benzoic acids.\\nThe corresponding SMILES representation is:\\nCCCN(CCC)S(=O)(=O)C1=CC=C(C=C1)C(=O)O\\nThe natural language question is: The molecule is the conjugate base of 3-(2,3-dihydroxyphenyl)propanoic acid. It derives from a propionate. It is a conjugate base of a 3-(2,3-dihydroxyphenyl)propanoic acid.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C(=C1)O)O)CCC(=O)[O-]\\nThe natural language question is: The molecule is an organosilicon compound that is phenol substituted by a trimethylsilyl group at position 4. It is a member of phenols and an organosilicon compound.\\nThe corresponding SMILES representation is:\\nC[Si](C)(C)C1=CC=C(C=C1)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside that is icas#20 in which the pro-R hydrogen beta to the carboxy group is replaced by a hydroxy group. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is an (omega-1)-hydroxy fatty acid ascaroside, a 3-hydroxy carboxylic acid, a 4-O-(1H-indol-3-ylcarbonyl)ascaroside and a monocarboxylic acid. 
It derives from a bhas#20, an icas#20 and a (3R,11R)-3,11-dihydroxylauric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCCC[C@H](CC(=O)O)O)O)OC(=O)C2=CNC3=CC=CC=C32'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25828',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is the open chain form of D-glucose. It is a D-glucose and an aldehydo-glucose. It is an enantiomer of an aldehydo-L-glucose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@H]([C@@H]([C@H](C=O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16082',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an ammonium ion that is the conjugate acid of L-histidinol arising from protonation of the primary amino function; major species at pH 7.3. It has a role as a Saccharomyces cerevisiae metabolite. It is a conjugate acid of a L-histidinol.\\nThe corresponding SMILES representation is:\\nC1=C(NC=N1)C[C@@H](CO)[NH3+]\\nThe natural language question is: The molecule is a steroid sulfate oxoanion that is the conjugate base of (3alpha,5alpha,17beta)-3-hydroxyandrostan-17-yl sulfate, obtained by deprotonation of the sulfo group; major species at pH 7.3. It is a conjugate base of a (3alpha,5alpha,17beta)-3-hydroxyandrostan-17-yl sulfate.\\nThe corresponding SMILES representation is:\\nC[C@]12CC[C@H](C[C@@H]1CC[C@@H]3[C@@H]2CC[C@]4([C@H]3CC[C@@H]4OS(=O)(=O)[O-])C)O\\nThe natural language question is: The molecule is a carbohydrate sulfonate that is 3-deoxy-D-erythro-hex-2-ulosonic acid in which the hydroxy group at position 6 is replaced by a sulfo group. It has a role as a bacterial xenobiotic metabolite. 
It is a carbohydrate acid derivative and a carbohydrate sulfonate. It is a conjugate acid of a 2-dehydro-3,6-dideoxy-6-sulfo-D-gluconate(2-).\\nThe corresponding SMILES representation is:\\nC([C@@H]([C@@H](CS(=O)(=O)O)O)O)C(=O)C(=O)O\\nThe natural language question is: The molecule is an aminoglycan consisting of beta-(1->4)-linked N-acetyl-D-glucosamine residues. It has a role as a vulnerary, a human metabolite, a Saccharomyces cerevisiae metabolite and a mouse metabolite. It is an aminoglycan and a N-acylglucosamine.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a hydrocarbyl anion. It is a conjugate base of a methane. It is a conjugate acid of a methanediide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[CH3-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1699',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a lactam that is 4-hydroxyazacyclotetradecan-2-one substituted by an ethyl group at position 11, methyl groups at position 3 and 7 and a 3-amino-3,6-dideoxy-alpha-L-talopyranosyl moiety at position 4 via a glycosyl linkage (the 3R,4S,7R,11S stereoisomer). It is isolated from the fermentation broth of an unidentified actinomycete species and exhibits potent inhibitory activity against influenza A virus. It has a role as an antimicrobial agent, an EC 3.2.1.18 (exo-alpha-sialidase) inhibitor and a metabolite. 
It is an aminoglycoside, a lactam and a macrocycle.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H]1CCC[C@H](CC[C@@H]([C@H](C(=O)NCCC1)C)O[C@H]2[C@@H]([C@@H]([C@@H]([C@@H](O2)C)O)N)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_555',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a linear amino tetrasaccharide comprising two alpha-N-glycoloylneuraminyl residues, a beta-D-galactose residue and (at the reducing end) an N-acetyl-D-glucosamine residue, linked sequentially (2->8), (2->3) and (1->4). It is a glucosamine oligosaccharide and an amino tetrasaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](OC1O)CO)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O[C@@]3(C[C@@H]([C@H]([C@@H](O3)[C@@H]([C@@H](CO)O[C@@]4(C[C@@H]([C@H]([C@@H](O4)[C@@H]([C@@H](CO)O)O)NC(=O)CO)O)C(=O)O)O)NC(=O)CO)O)C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21557',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a branched amino tetrasaccharide consisting of a beta-D-galactose residue (1->3)-linked to the reducing-end GalNAc residue of a linear chain of beta-D-galactose, N-acetyl-beta-D-glucosamine and N-acetyl-D-galactosamine residues linked (1->4) and (1->6) respectively. It has a role as an epitope. 
It is an amino tetrasaccharide, a glucosamine oligosaccharide and a galactosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1OC[C@@H]2[C@@H]([C@@H]([C@H](C(O2)O)NC(=O)C)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)CO)O[C@H]4[C@@H]([C@H]([C@H]([C@H](O4)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21482',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion arising from deprotonation of the carboxy and phosphate OH groups of 7-phospho-2-dehydro-3-deoxy-D-arabino-heptonic acid; major species at pH 7.3. It has a role as a Saccharomyces cerevisiae metabolite. It is an organophosphate oxoanion, a carbohydrate acid derivative anion and a monocarboxylic acid anion. It is a conjugate base of a 7-phospho-2-dehydro-3-deoxy-D-arabino-heptonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@@H]([C@@H](COP(=O)([O-])[O-])O)O)O)C(=O)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_230',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organic cation obtained by protonation of the secondary amino function of methamphetamine. It is a conjugate acid of a methamphetamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](CC1=CC=CC=C1)[NH2+]C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18797',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an aldehyde that is acetaldehyde in which one of the methyl hydrogens has been replaced by a methylsulfanyl group. It is an aldehyde and a methyl sulfide. 
It derives from an acetaldehyde.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CSCC=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13917',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a dolichyl diphosphooligosaccharide in which the oligosaccharide moiety is the Man6GlcNAc2 branched octasaccharide alpha-D-Man-(1->2)-alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->3)-alpha-D-Man-(1->6)]-beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc. It is a conjugate acid of an alpha-D-Man-(1->2)-alpha-D-Man-(1->2)-alpha-D-Man-(1->3)-[alpha-D-Man-(1->3)-alpha-D-Man-(1->6)]-beta-D-Man-(1->4)-beta-D-GlcNAc-(1->4)-D-GlcNAc(PP-Dol)(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CC/C=C(/C)\\\\\\\\CC/C=C(\\\\\\\\C)/CC/C=C(\\\\\\\\C)/CCC=C(C)C)CCOP(=O)(O)OP(=O)(O)OC1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@H]([C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO)O)O[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O)O)O)O[C@@H]6[C@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O[C@@H]7[C@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O[C@@H]8[C@H]([C@H]([C@@H]([C@H](O8)CO)O)O)O)O)O)NC(=O)C)O)NC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10624',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a 2'-deoxyribonucleoside triphosphate oxoanion that is a trianion of 2'-deoxycytidine 5'-triphosphate, arising from deprotonation of three of the four triphosphate OH groups. It has a role as a Saccharomyces cerevisiae metabolite. It is a conjugate base of a dCTP. 
It is a conjugate acid of a dCTP(4-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1[C@@H]([C@H](O[C@H]1N2C=CC(=NC2=O)N)COP(=O)([O-])OP(=O)([O-])OP(=O)(O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12034',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a primary alcohol that is ethanol substituted by a phenyl group at position 2. It has a role as a fragrance, a Saccharomyces cerevisiae metabolite, a plant metabolite, an Aspergillus metabolite and a plant growth retardant. It is a primary alcohol and a member of benzenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)CCO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14300',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an optically active form of dopa having L-configuration. Used to treat the stiffness, tremors, spasms, and poor muscle control of Parkinson's disease It has a role as a prodrug, a hapten, a neurotoxin, an antiparkinson drug, a dopaminergic agent, an antidyskinesia agent, an allelochemical, a plant growth retardant, a human metabolite, a mouse metabolite and a plant metabolite. It is a dopa, a L-tyrosine derivative and a non-proteinogenic L-alpha-amino acid. It is a conjugate acid of a L-dopa(1-). It is an enantiomer of a D-dopa. It is a tautomer of a L-dopa zwitterion.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C=C1C[C@@H](C(=O)O)N)O)O\\nThe natural language question is: The molecule is a peptide zwitterion obtained by transfer of a proton from the carboxy to the amino terminus of Ala-Thr. 
It is a tautomer of an Ala-Thr.\\nThe corresponding SMILES representation is:\\nC[C@H]([C@@H](C(=O)[O-])NC(=O)[C@H](C)[NH3+])O\\nThe natural language question is: The molecule is the conjugate base of L-gamma-glutamyl-L-cysteinyl-beta-alanine having an anionic carboxy terminus and a zwitterionic gamma-glutamyl residue; major species at pH 7.3. It is a conjugate base of a L-gamma-glutamyl-L-cysteinyl-beta-alanine.\\nThe corresponding SMILES representation is:\\nC(CC(=O)N[C@@H](CS)C(=O)NCCC(=O)[O-])[C@@H](C(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is a pyridine nucleoside consisting of 1,4-dihydronicotinamide with a beta-D-ribofuranosyl moiety at the 1-position. It is a dihydropyridine and a pyridine nucleoside.\\nThe corresponding SMILES representation is:\\nC1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)CO)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is the simplest of the sulfamic acids consisting of a single sulfur atom covalently bound by single bonds to two amino groups and by double bonds to two oxygen atoms.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'NS(=O)(=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2671',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of quinolines and an organic iodide salt. It has a role as a fluorochrome. 
It contains a cryptocyanin cation.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN1C=C/C(=C\\\\\\\\C=C\\\\\\\\C2=CC=[N+](C3=CC=CC=C23)CC)/C4=CC=CC=C41.[I-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19603',\n", + " 'prompt': \"Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an acetate ester obtained by formal condensation between the 3-hydroxy group of (+)-taxifolin and acetic acid. It is an acetate ester, a member of 3'-hydroxyflavanones, a tetrahydroxyflavanone and a member of 4'-hydroxyflavanones. It derives from a (+)-taxifolin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC(=O)O[C@@H]1[C@H](OC2=CC(=CC(=C2C1=O)O)O)C3=CC(=C(C=C3)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29623',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a C-nitro compound that is benzaldehyde substituted at the para-position with a nitro group. It is a C-nitro compound and a member of benzaldehydes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C=O)[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2213',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a diarylheptanoid that is 3-heptanone substituted by a 4-hydroxy-3-methoxyphenyl group at position 1 and a 4-hydroxyphenyl group at position 7. It has been isolated from the rhizomes of Curcuma kwangsiensis. It has a role as a plant metabolite. 
It is a diarylheptanoid, a ketone and a member of guaiacols.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)CCC(=O)CCCCC2=CC=C(C=C2)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11947',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydroxybenzaldehyde that is 4-hydroxybenzaldehyde bearing an additional amino substituent at position 3. It has a role as a bacterial metabolite. It is a hydroxybenzaldehyde and a substituted aniline.\\nThe corresponding SMILES representation is:\\nC1=CC(=C(C=C1C=O)N)O\\nThe natural language question is: The molecule is the iminium ion derived from acid green 5 (the disodium salt). It is a conjugate acid of an acid green 5(2-).\\nThe corresponding SMILES representation is:\\nCCN(CC1=CC(=CC=C1)S(=O)(=O)O)C2=CC=C(C=C2)C(=C3C=CC(=[N+](CC)CC4=CC(=CC=C4)S(=O)(=O)O)C=C3)C5=CC=C(C=C5)S(=O)(=O)O\\nThe natural language question is: The molecule is a glycophytoceramide having an alpha-D-galactopyranosyl residue at the O-1 position and a decanoyl group attached to the nitrogen. It derives from an alpha-D-galactose and a decanoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC[C@H]([C@H]([C@H](CO[C@@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)NC(=O)CCCCCCCCC)O)O\\nThe natural language question is: The molecule is an unsaturated fatty acid anion that is the conjugate base of (8E,10S)-10-hydroxy-8-octadecenoic acid, obtained by deprotonation of the carboxy group. It is a hydroxy fatty acid anion, a long-chain fatty acid anion and an unsaturated fatty acid anion. 
It is a conjugate base of an (8E,10S)-10-hydroxy-8-octadecenoic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCC[C@@H](/C=C/CCCCCCC(=O)[O-])O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of benzimidazoles carrying a 1,3-thiazol-4-yl substituent at position 2. A mainly post-harvest fungicide used to control a wide range of diseases including Aspergillus, Botrytis, Cladosporium and Fusarium. It has a role as an antifungal agrochemical and an antinematodal drug. It is a member of benzimidazoles, a member of 1,3-thiazoles and a benzimidazole fungicide. It derives from a hydride of a 1H-benzimidazole.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C(=C1)NC(=N2)C3=CSC=N3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19243',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an organophosphate oxoanion obtained by deprotonation of the phosphate OH groups of D-ribulose 5-phosphate. Major structute at pH 7.3. It is a conjugate base of a D-ribulose 1-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@H]([C@H](C(=O)COP(=O)([O-])[O-])O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6992',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the fluorescent compound widely used in experimental cell biology and biochemistry to reveal double-stranded DNA and RNA. It has a role as an intercalator and a fluorochrome. 
It derives from a hydride of a phenanthridine.\\nThe corresponding SMILES representation is:\\nCC[N+]1=C2C=C(C=CC2=C3C=CC(=CC3=C1C4=CC=CC=C4)N)N\\nThe natural language question is: The molecule is an aryl sulfide that is (2Z,3Z)-bis[amino(sulfanyl)methylidene]butanedinitrile in which the sulfanyl hydrogens are replaced by 2-aminophenyl groups. An inhibitor of mitogen-activated protein kinase that also exhibits anti-cancer properties. It has a role as an EC 2.7.11.24 (mitogen-activated protein kinase) inhibitor, an apoptosis inducer, an antineoplastic agent, an antioxidant, an osteogenesis regulator and a vasoconstrictor agent. It is an enamine, an aryl sulfide, a substituted aniline and a dinitrile.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C(=C1)N)S/C(=C(/C(=C(/SC2=CC=CC=C2N)\\\\\\\\N)/C#N)\\\\\\\\C#N)/N\\nThe natural language question is: The molecule is the conjugate base of alpha-mycolic acid type-1 (II). A class of mycolic acids characterized by the presence of a proximal trans and a distal cis cyclopropyl group in the meromycolic chain.\\nThe corresponding SMILES representation is:\\nCCC1CC1CC(C)C2CC2C[C@H]([C@@H](CC)C(=O)[O-])O\\nThe natural language question is: The molecule is a member of the cadinene family of sesquiterpenes in which the isopropyl group is cis to the hydrogen at the adjacent bridgehead carbon (the 1S,4aR,8aR enantiomer). It is a cadinene and a member of octahydronaphthalenes. It is an enantiomer of a (-)-gamma-cadinene.\\nThe corresponding SMILES representation is:\\nCC1=C[C@@H]2[C@@H](CC1)C(=C)CC[C@H]2C(C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a fifteen-membered glycopeptide comprising glycyl, 3-(1,3-thiazol-4-yl)alanyl, alanyl, glycyl, 4-pyridylalanyl, (5R)-5-(beta-D-galactopyranosyloxy)lysyl, glycyl. 
alpha-glutamyl, glutaminyl, glycyl, prolyl, lysyl, glycyl, alpha-glutamyl and threonine residues coupled in sequence.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@@H](C(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@@H]1CCCN1C(=O)CNC(=O)[C@H](CCC(=O)N)NC(=O)[C@H](CCC(=O)O)NC(=O)CNC(=O)[C@H](CC[C@H](CN)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)NC(=O)[C@H](CC3=CC=NC=C3)NC(=O)CNC(=O)[C@H](C)NC(=O)[C@H](CC4=CSC=N4)NC(=O)CN)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11875',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a steroidal acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 3-hydroxy-9-oxo-9,10-seco-23,24-bisnorchola-1,3,5(10)-trien-22-oic acid. It derives from a 3-hydroxy-9-oxo-9,10-seco-23,24-bisnorchola-1,3,5(10)-trien-22-oic acid. It is a conjugate acid of a 3-hydroxy-9-oxo-9,10-seco-23,24-bisnorchola-1,3,5(10)-trien-22-oyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C(C=C(C=C1)O)CC[C@H]2[C@@H]3CC[C@@H]([C@]3(CCC2=O)C)[C@H](C)C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]4[C@H]([C@H]([C@@H](O4)N5C=NC6=C(N=CN=C65)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21619',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an amino tetrasaccharide comprising an alpha-sialyl residue, two N-acetyl-beta-D-glucosaminyl residues and an N-acetyl-D-glucosamine residue linked in a (2->6), (1->4) and (1->4) sequence. It has a role as an epitope. 
It is an amino tetrasaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H](C[C@@](O[C@H]1[C@@H]([C@@H](CO)O)O)(C(=O)O)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O[C@@H]3[C@H](O[C@H]([C@@H]([C@H]3O)NC(=O)C)O[C@@H]4[C@H](OC([C@@H]([C@H]4O)NC(=O)C)O)CO)CO)NC(=O)C)O)O)O\\nThe natural language question is: The molecule is a germacranolide isolated from Lychnophora antillana and has been shown to exhibit antineoplastic activity. It has a role as a metabolite and an antineoplastic agent. It is a germacranolide, an acetate ester, a cyclic ketone and a secondary alcohol.\\nThe corresponding SMILES representation is:\\nC[C@H]1CCC(=O)[C@](C[C@@H]([C@@H]2[C@@H]([C@H]1O)OC(=O)C2=C)OC(=O)C(=C)C)(C)OC(=O)C\\nThe natural language question is: The molecule is an arsonium ion consisting of four phenyl groups attached to a central arsonium. It derives from a hydride of an arsonium.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)[As+](C2=CC=CC=C2)(C3=CC=CC=C3)C4=CC=CC=C4\\nThe natural language question is: The molecule is a polyprenyl glycosyl diphosphate having eleven prenyl units and with beta-D-glucosyl-(1->4)-N-acetyl-D-glucosamine as the glycosyl fragment. It is a conjugate acid of a beta-D-glucosyl-(1->4)-N-acetyl-D-glucosaminyl undecaprenyl diphosphate(2-).\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)(O)OP(=O)(O)OC1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a steroid glucuronide anion that is the conjugate base of 2-methoxyestrone 3-O-(beta-D-glucuronide) arising from deprotonation of the carboxylic acid function; major species at pH 7.3. 
It is a steroid glucosiduronic acid anion, a beta-D-glucosiduronate and a monocarboxylic acid anion. It is a conjugate base of a 2-methoxyestrone 3-O-(beta-D-glucuronide).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@H]3[C@H]([C@@H]1CCC2=O)CCC4=CC(=C(C=C34)OC)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)[O-])O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14607',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a triacyl-sn-glycerol in which the acyl groups at positions 1 and 2 are specified as oleoyl while that at position 3 is specified as (11Z)-icosenoyl. It has a role as a human blood serum metabolite. It is a triacylglycerol 56:3 and a triacyl-sn-glycerol.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCCCC(=O)OC[C@@H](COC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC\\nThe natural language question is: The molecule is an O-hexanoylcarnitine that has L configuration. It has a role as a human metabolite. It is an O-hexanoylcarnitine and a saturated fatty acyl-L-carnitine.\\nThe corresponding SMILES representation is:\\nCCCCCC(=O)O[C@H](CC(=O)[O-])C[N+](C)(C)C\\nThe natural language question is: The molecule is a C-glycosyl compound that is 1,8-dihydroxy-3-methylanthracen-9(10H)-one substituted by a 1-O-acetyl-3-O-senecioyl-alpha-L-lyxopyranosyl moiety at position 10 via a C-glycosidic linkage (the 10S stereoisomer). It is isolated from the leaves of Alvaradoa haitiensis and exhibits cytotoxicity against human oral epidermoid carcinoma. It has a role as a metabolite and an antineoplastic agent. It is a member of anthracenes, a C-glycosyl compound, an acetate ester and a polyphenol. 
It derives from a 3-methylbut-2-enoic acid.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C(=C1)O)C(=O)C3=C([C@@H]2[C@H]4[C@@H]([C@H]([C@H]([C@@H](O4)OC(=O)C)O)OC(=O)C=C(C)C)O)C=CC=C3O\\nThe natural language question is: The molecule is the conjugate acid of (S)-6-hydroxynicotine; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a (S)-6-hydroxynicotine. It is an enantiomer of a (R)-6-hydroxynicotinium.\\nThe corresponding SMILES representation is:\\nC[NH+]1CCC[C@H]1C2=CNC(=O)C=C2\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 2-pyranone in which the hydrogens at positions 4 and 6 of 2H-pyran-2-one are replaced by hydroxy and 2-oxotridecyl groups respectively. It is a member of 2-pyranones, a ketone and a heteroaryl hydroxy compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCC(=O)CC1=CC(=CC(=O)O1)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27913',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a polycyclic cage that is the methyl ester derivative of 14-hydroxyplatensic acid. It is isolated from Streptomyces platensis. It has a role as a metabolite. It is a cyclic ether, a cyclic ketone, a polycyclic cage, a secondary alcohol and a methyl ester. It derives from a 14-hydroxyplatensic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]1([C@@H]2[C@@H]3C[C@@H]4C[C@@]2(C=CC1=O)[C@H]([C@@]4(O3)C)O)CCC(=O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25033',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a purine derivative that consists of 6-aminohexanoic acid having a purin-6-oyl group attached to the amino function via an amide bond. 
It is a member of purines and a monocarboxylic acid amide. It derives from a 6-aminohexanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC2=NC=NC(=C2N1)C(=O)NCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28291',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a member of the class of imidazolines that is 4,5-dihydro-1H-imidazole which is substituted by a 1H-indole-3-carbonyl group at position 2 and a 6-bromo-1H-indol-3-yl group at position 4S. It is a bisindole alkaloid isolated from the marine sponge, Spongosorites sp. It has a role as an antineoplastic agent and a marine metabolite. It is a bromoindole, a bisindole alkaloid, a member of imidazolines and an aromatic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@@H](NC(=N1)C(=O)C2=CNC3=CC=CC=C32)C4=CNC5=C4C=CC(=C5)Br'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19776',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a member of the class of cleistanthins that is cleistanthin A in which the 3,4-di-O-methyl-D-xylopyranosyl group is replaced by a beta-D-glucopyranosyl group. It has a role as an antihypertensive agent, an alpha-adrenergic antagonist and a diuretic. 
It is a member of cleistanthins, a beta-D-glucoside and a monosaccharide derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=C2C(=C1)C(=C3C(=C2O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)COC3=O)C5=CC6=C(C=C5)OCO6)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12169',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a hydroxyflavan that is (2S)-flavan substituted by a hydroxy group at position 4. It derives from a hydride of a (2S)-flavan.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H](OC2=CC=CC=C2C1O)C3=CC=CC=C3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27399',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a steroid glucuronide anion that is the conjugate base of 5alpha-dihydrotestosterone 17-O-(beta-D-glucuronide) arising from deprotonation of the carboxylic acid function; major species at pH 7.3. It is a steroid glucosiduronic acid anion, a beta-D-glucosiduronate and a monocarboxylic acid anion. It is a conjugate base of a 5alpha-dihydrotestosterone 17-O-(beta-D-glucuronide).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CCC(=O)C[C@@H]1CC[C@@H]3[C@@H]2CC[C@]4([C@H]3CC[C@@H]4O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)[O-])O)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25396',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a naphtho-gamma-pyrone that is 1H-benzo[g]isochromen-1-one substituted by a methyl group at position 3 and hydroxy groups at positions 7, 9 and 10. It has a role as a fungal metabolite. 
It is a heptaketide, a member of phenols, a lactone and a naphtho-alpha-pyrone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC2=CC3=CC(=CC(=C3C(=C2C(=O)O1)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6827',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an olefinic compound that is hept-5-en-1-ol substituted by methyl groups at positions 2 and 6 respectively. It has a role as a metabolite. It is a primary alcohol and an olefinic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCC=C(C)C)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17258',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a mnocarboxylic acid anion that is the conjugate base of 13-[(9Z)-hexadecenoyloxy]octadecanoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a 13-[(9Z)-hexadecenoyloxy]octadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC(CCCCC)CCCCCCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23968',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a capric acid derivative carrying an oxo group at position 10. It is an oxo monocarboxylic acid, an aldehydic acid, a medium-chain fatty acid and an omega-oxo fatty acid. It derives from a decanoic acid. 
It is a conjugate acid of a 10-oxocaprate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCCCC(=O)O)CCCC=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24707',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a HPETE in which the hydroperoxy group is located at position 8S and the four double bonds at position 5, 9, 11 and 14 (the 5Z,9E,11Z,14Z-geoisomer). It has a role as a mouse metabolite. It derives from an icosa-5,9,11,14-tetraenoic acid. It is a conjugate acid of an 8(S)-HPETE(1-). It is an enantiomer of an 8(R)-HPETE.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\[C@H](C/C=C\\\\\\\\CCCC(=O)O)OO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6368',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is dianion of ADP-D-glycero-beta-D-manno-heptose arising from deprotonation of both free OH groups of the diphosphate. It is a conjugate base of an ADP-D-glycero-beta-D-manno-heptose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])O[C@H]4[C@H]([C@H]([C@@H]([C@H](O4)[C@@H](CO)O)O)O)O)O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26052',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an amino-acid zwitterion arising form transfer of a proton from the carboxy to the amino group of 3-amino-3-phenylpropanoic acid; major species at pH 7.3. 
It is a tautomer of a 3-amino-3-phenylpropanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C(CC(=O)[O-])[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15129',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an organic anion that is the conjugate base of aspulvinone E, arising from selective deprotonation of the butenolide OH group; major species at pH 7.3. It is a conjugate base of an aspulvinone E.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C\\\\\\\\2/C(=C(C(=O)O2)C3=CC=C(C=C3)O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2823',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a pyridinedicarboxylate and a quinolinate. It has a role as a human metabolite, a mouse metabolite and a Saccharomyces cerevisiae metabolite. It is a conjugate base of a quinolinate(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(N=C1)C(=O)[O-])C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7249',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an aminoimidazole that is 5-amino-1H-imidazole which is substituted at positions 1 and 4 by aminocarbonyl and cyano groups, respectively. 
It is an aminoimidazole, a nitrile and a primary amino compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=NC(=C(N1C(=O)N)N)C#N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26462',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an anionic ceramide phosphoinositol compound having a tetracosanoyl group attached to the ceramide nitrogen, no hydroxylation at C-4 of the long-chain base, and hydroxylation at C-2 of the very-long-chain fatty acid. Major species at pH 7.3. It is a conjugate base of an Ins-1-P-Cer(d18:0/2-OH-24:0).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCC[C@@H](C(=O)N[C@@H](COP(=O)([O-])OC1[C@@H]([C@H](C([C@H]([C@H]1O)O)O)O)O)[C@@H](CCCCCCCCCCCCCCC)O)O\\nThe natural language question is: The molecule is a prostaglandin carboxylic acid anion that is the conjugate base of prostaglandin H3, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a prostaglandin H3.\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C[C@@H](/C=C/[C@H]1[C@H]2C[C@@H]([C@@H]1C/C=C\\\\\\\\CCCC(=O)[O-])OO2)O\\nThe natural language question is: The molecule is a 1,2-diglyceride in which the acyl groups at positions 1 and 2 are specifed as tetradecanoyl. It is a 1,2-diglyceride and a tetradecanoate ester.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCCCC\\nThe natural language question is: The molecule is a cyanine dye and an organic potassium salt. It has a role as a fluorochrome. 
It contains a NIR-3(2-).\\nThe corresponding SMILES representation is:\\nCC1(C2=C(C=CC(=C2)C(=O)O)[N+](=C1/C=C/C=C/C=C/C=C\\\\\\\\3/C(C4=C(N3CCCCS(=O)(=O)[O-])C=CC(=C4)S(=O)(=O)[O-])(C)C)CCCCS(=O)(=O)[O-])C.[K+].[K+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a tetrapeptide composed of L-alanine, L-leucine, L-threonine, and L-proline units joined in sequence. It has a role as a metabolite. It derives from a L-alanine, a L-leucine, a L-threonine and a L-proline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]([C@@H](C(=O)N1CCC[C@H]1C(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](C)N)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7820',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a dihydroxy monocarboxylic acid that is dodecanoic acid (lauric acid) in which a hydrogen at position 2 and a hydrogen at position 3 have each been replaced by a hydroxy group. It is a dihydroxy monocarboxylic acid and a medium-chain fatty acid. It is a conjugate acid of a 2,3-dihydroxydodecanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCC(C(C(=O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17521',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an N-acyl-15-methylhexadecasphing-4-enine in which the acyl group has 19 carbons and 0 double bonds. 
It derives from a 15-methylhexadecasphing-4-enine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@@H](/C=C/CCCCCCCCCC(C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_291',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a dipeptide formed from two L-isoleucine residues. It has a role as a Mycoplasma genitalium metabolite and a human metabolite. It derives from a L-isoleucine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@@H](C(=O)N[C@@H]([C@@H](C)CC)C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3349',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a mannarate(2-) that is the dianion obtained by the deprotonation of the carboxy groups of L-mannaric acid. It is a conjugate base of a L-mannarate(1-). It is an enantiomer of a D-mannarate(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[C@@H]([C@H]([C@H](C(=O)[O-])O)O)([C@H](C(=O)[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27828',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an N-acyl-L-alpha-amino acid anion that is the conjugate base of N-acetyl-L-leucine: major species at pH 7.3. It is a conjugate base of a N-acetyl-L-leucine. 
It is an enantiomer of a N-acetyl-D-leucinate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C[C@@H](C(=O)[O-])NC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2689',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a gallate ester obtained by the formal condensation of gallic acid with ethanol. It has a role as a plant metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCOC(=O)C1=CC(=C(C(=C1)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13856',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a cyclitol phosphate that is validoxylamine A carrying a single monophosphate substituent at position 7'. It has a role as a bacterial metabolite. It is an amino cyclitol, a cyclitol phosphate and a secondary amino compound. It is a conjugate acid of a validoxylamine A 7'-phosphate(1-).\\nThe corresponding SMILES representation is:\\nC1[C@@H]([C@H]([C@@H]([C@H]([C@H]1N[C@H]2C=C([C@H]([C@@H]([C@H]2O)O)O)CO)O)O)O)COP(=O)(O)O\\nThe natural language question is: The molecule is a 17-oxosteroid that is 17-oxo-5beta,9beta,10alpha,13alpha-androsta-11,15-dieneandroxta-11,15-diene which is substituted by an acetoxy group at the 3beta position, methyl groups at the 4, 4, 8, 12, and 16 positions, a methoxycarbonyl group at position 14, and a hydroxy group at position 15. A farnesyltransferase inhibitor produced by Penicillium roqueforti, a filamentous fungus involved in the ripening of several kinds of blue cheeses. It has a role as an EC 2.5.1.58 (protein farnesyltransferase) inhibitor and a Penicillium metabolite. 
It is a 15-hydroxy steroid, a 17-oxo steroid, a 5beta steroid, an acetate ester, an enol, a meroterpenoid and a methyl ester. It is a conjugate acid of an andrastin C(1-).\\nThe corresponding SMILES representation is:\\nCC1=C[C@H]2[C@@]3(CC[C@@H](C([C@H]3CC[C@@]2([C@]4([C@@]1(C(=C(C4=O)C)O)C)C(=O)OC)C)(C)C)OC(=O)C)C\\nThe natural language question is: The molecule is an aminotrisaccharide consisting of beta-D-galactopyranose, 2-acetamido-2-deoxy-beta-D-glucopyranose and L-furopyranose residues joined in sequence by (1->4) and (1->3) glycosidic bonds. It is an amino trisaccharide and a member of acetamides. It derives from a N-acetyllactosamine.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@H]([C@H]([C@@H](C(O1)O)O)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)NC(=O)C)O\\nThe natural language question is: The molecule is an N-acylglycine in which the acyl group is specified as (2E)-3-phenylprop-2-enoyl (cinnamoyl). It has a role as a metabolite. It is a conjugate acid of a N-cinnamoylglycinate.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)/C=C/C(=O)NCC(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a member of the class of benzothiazines that is 2H-1,4-benzothiazine-3-carboxylic acid bearing additional hydroxy and 2-amino-2-carboxyethyl substituents at positions 5 and 7 respectively. It has a role as a human metabolite. 
It is a benzothiazine, a dicarboxylic acid, a member of phenols and a non-proteinogenic alpha-amino acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1C(=NC2=C(C=C(C=C2S1)CC(C(=O)O)N)O)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27656',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the simplest and least toxic member of the class of chloroethanes, that is ethane in which a single hydrogen is substituted by a chlorine. A colourless gas at room temperature and pressure (boiling point 12℃), it is used as a mild topical anaesthetic to numb the skin prior to ear piercing, skin biopsies, etc., and is also used in the treatment of sports injuries. It was formerly used in the production of tetraethyllead. It has a role as a local anaesthetic, an antipruritic drug and an inhalation anaesthetic.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18424',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is an organophosphate oxoanion arising from deprotonation of the phosphate OH groups of NMNH; major species at pH 7.3. It derives from a NMN(+). It is a conjugate base of a NMNH.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1C=CN(C=C1C(=O)N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3629',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a carboxylic acid anion obtained by deprotonation of the carboxy group of gibberellin A14 aldehyde. 
It is a conjugate base of a gibberellin A14 aldehyde.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12CC[C@@H]([C@@]([C@H]1[C@@H]([C@]34[C@H]2CC[C@H](C3)C(=C)C4)C=O)(C)C(=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3265',\n", + " 'prompt': \"Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an L-lysyl ester obtained by formal condensation of the carboxy group of L-lysine with the 3'-hydroxy group of AMP. It has a role as a Mycoplasma genitalium metabolite. It is an adenosine 5'-phosphate, a L-lysyl ester and a purine ribonucleoside 5'-monophosphate. It derives from an adenosine 5'-monophosphate.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)O)OC(=O)[C@H](CCCCN)N)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21968',\n", + " 'prompt': \"Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a ribonucleoside triphosphate oxoanion that is the hexaanion of guanosine 3'-diphosphate 5'-triphosphate; major species at pH 7.3. It is a conjugate base of a guanosine 3'-diphosphate 5'-triphosphate. It is a conjugate acid of a guanosine 3'-diphosphate 5'-triphosphate(7-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])OP(=O)([O-])OP(=O)(O)[O-])O)N=C(NC2=O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29453',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an enoate ester that is the methyl ester of (2E)-2-{2-[({(E)-[(3E)-4-(4-chlorophenyl)but-3-en-2-ylidene]amino}oxy)methyl]phenyl}-3-methoxyprop-2-enoic acid. 
A fungicide used for control of leaf blotch, leaf rust and powdery mildew on wheat and other fungal diseases on cucumbers, tomatoes and grapes. It has a role as a mitochondrial cytochrome-bc1 complex inhibitor and an antifungal agrochemical. It is an enoate ester, an enol ether, an oxime O-ether, a member of monochlorobenzenes, a methyl ester and a methoxyacrylate strobilurin antifungal agent.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C(=N\\\\\\\\OCC1=CC=CC=C1/C(=C\\\\\\\\OC)/C(=O)OC)/C=C/C2=CC=C(C=C2)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20084',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a penicillanic acid ester that is the pivaloyloxymethyl ester of ampicillin. It is a prodrug of ampicillin. It has a role as a prodrug. It is a penicillanic acid ester and a pivaloyloxymethyl ester. It derives from an ampicillin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=CC=C3)N)C(=O)OCOC(=O)C(C)(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6933',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a stilbenoid that is the (-)-cis-stereoisomer of epsilon-viniferin, obtained by cyclodimerisation of cis-resveratrol. It has a role as a metabolite. It is a member of 1-benzofurans, a polyphenol and a stilbenoid. It derives from a cis-resveratrol. 
It is an enantiomer of a (+)-cis-epsilon-viniferin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C\\\\\\\\C2=C3[C@H]([C@@H](OC3=CC(=C2)O)C4=CC=C(C=C4)O)C5=CC(=CC(=C5)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23471',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an alkaloid ester, a methyl ester, an aldehyde and an organic heterotetracyclic compound. It derives from a sarpagine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C/C=C\\\\\\\\1/CN2[C@H]3C[C@H]1[C@@]([C@@H]2CC4=C3NC5=CC=CC=C45)(C=O)C(=O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9239',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organic heteropentacyclic compound that is 6H-furo[3,2-c]xanthen-6-one substituted by hydroxy groups at positions 7 and 10, 2-methoxypropan-2-yl group at position 2 and a 2-methylbut-3-en-2-yl group at position 8. Isolated from the woods of Garcinia subelliptica, it exhibits antioxidant activity. It has a role as a metabolite and an antioxidant. It is a cyclic ether, a polyphenol, a cyclic ketone and an organic heterotetracyclic compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)(C=C)C1=CC(=C2C(=C1O)C(=O)C3=C(O2)C4=C(C=C3)C=C(O4)C(C)(C)OC)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7447',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a gamma-amino acid that is GABA in which one of the hydrogens at position 4 is replaced by a cyano group. It is a gamma-amino acid, a monocarboxylic acid and an aliphatic nitrile. 
It derives from a butyric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)O)C(C#N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2584',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a C5-acylcarnitine having isovaleryl as the acyl substituent. It has a role as a human metabolite. It derives from an isovaleric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6418',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an ultra-long-chain fatty acid anion that is the conjugate base of dotriacontanoic acid (lacceroic acid), obtained by deprotonation of the carboxy group. It is a straight-chain saturated fatty acid anion and an ultra-long-chain fatty acid anion. It derives from a 3-oxodotriacontanoyl-CoA(4-). It is a conjugate base of a dotriacontanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7597',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a labdane diterpenoid that is 15,16-epoxy-7,11,13(16)14-labdatetraen-6-one substituted by a hydroxy group at position 9. Isolated from the rhizomes of Hedychium spicatum, it exhibits cytotoxicity against the Colo-205 (Colo-cancer), A-431 (skin cancer), MCF-7 (breast cancer), A-549 (lung cancer) and Chinese hamster ovary cells (CHO). It has a role as a metabolite and an antineoplastic agent. 
It is a member of furans, an enone, a tertiary alcohol, a labdane diterpenoid and a member of hexahydronaphthalenes.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC(=O)[C@@H]2[C@@]([C@]1(/C=C/C3=COC=C3)O)(CCCC2(C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20446',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a butan-4-olide that is dihydrofuran-2(3H)-one substituted by a hydroxy group at position 4, a 2-hydroxy-2-methyl-14-phenyltetradeca-11,13-dien-1-yl group at position 5 and a methyl group at position 5 (the 4S,5S stereoisomer). It is isolated from the Australian marine sponge Plakinastrella clathrata. It has a role as a metabolite. It is a butan-4-olide and a diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]1([C@H](CC(=O)O1)O)C[C@](C)(CCCCCCCC/C=C/C=C/C2=CC=CC=C2)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24415',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a hydroxyaurone that is aurone which is substituted by hydroxy groups at the 3' and 4' positions; major species at pH 7.3. It shows inhibitory activity against several isoforms of the histone deacetylase complex (HDAC). It has a role as an EC 3.5.1.98 (histone deacetylase) inhibitor. It is a hydroxyaurone and a member of catechols. 
It derives from a 2',3,4-trihydroxy-trans-chalcone.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC=C2C(=C1)C(=O)/C(=C/C3=CC(=C(C=C3)O)O)/O2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15619',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an organic phosphonate that is the trisacetoxymethylester derivative of naphthalene substituted hydroxymethylphosphonic acid. It has been found to inhibit insulin receptor tyrosine kinase activity and insulin stimulated glucose oxidation. It has a role as a tyrosine kinase inhibitor. It is an organic phosphonate, a member of naphthalenes and an acetate ester. It derives from a hydroxymethylphosphonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)OCOC(C1=CC2=CC=CC=C2C=C1)P(=O)(OCOC(=O)C)OCOC(=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23609',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a member of furans and a member of tetrahydrothiophenes. It has a role as a pyrethroid ester insecticide and an agrochemical. It derives from a (-)-cis-chrysanthemic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1([C@@H]([C@@H]1C(=O)OCC2=COC(=C2)CC3=CC=CC=C3)/C=C/4\\\\\\\\CCSC4=O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1624',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is the N-(3-acetamidopropyl)-derivative of 4-aminobutanal. It is a monocarboxylic acid amide and an alpha-CH2-containing aldehyde. 
It is a conjugate base of a N-(3-acetamidopropyl)-4-ammoniobutanal.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)NCCCNCCCC=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15113',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an acyl-CoA(4-) oxoanion arising from deprotonation of the phosphate and diphosphate OH groups of 3-oxochola-4,22-dien-24-oyl-CoA; major species at pH 7.3. It is a conjugate base of a 3-oxochola-4,22-dien-24-oyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O)[C@H]4CC[C@@H]5[C@@]4(CC[C@H]6[C@H]5CCC7=CC(=O)CC[C@]67C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_518',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside derived from (7R)-7-hydroxyoctanoic acid. It is a metabolite of the nematode Caenorhabditis elegans. It has a role as a Caenorhabditis elegans metabolite. It is a 4-O-(1H-indol-3-ylcarbonyl)ascaroside, a monocarboxylic acid and an (omega-1)-hydroxy fatty acid ascaroside. It derives from an ascr#14 and a (7R)-7-hydroxyoctanoic acid.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)O[C@H](C)CCCCCC(=O)O)O)OC(=O)C2=CNC3=CC=CC=C32\\nThe natural language question is: The molecule is an acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate groups of oscr#23-CoA; major species at pH 7.3. 
It is a conjugate base of an oscr#23-CoA.\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCC/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)([O-])[O-])O)O)O\\nThe natural language question is: The molecule is the open chain form of L-tagatose 6-phosphate It is a conjugate acid of a keto-L-tagatose 6-phosphate(2-). It is an enantiomer of a keto-D-tagatose 6-phosphate.\\nThe corresponding SMILES representation is:\\nC([C@@H]([C@H]([C@H](C(=O)CO)O)O)O)OP(=O)(O)O\\nThe natural language question is: The molecule is a dibenzooxepine diterpenoid that is hexahydrodibenzo[b,e]oxepine with an isolated double bond between positions 6a and 7 and is substituted by a bromo, a carboxy, a 3E-4,8-dimethylnona-3,7-dien-1-yl and a methyl group at positions 9, 2, 10 and 10 respectively (the 9S,10S,10aR stereoisomer). It is isolated from the Fijian red alga Callophycus serratus and exhibits antibacterial, antimalarial and anticancer activities. It has a role as a metabolite, an antibacterial agent, an antimalarial and an antineoplastic agent. It is a member of benzoic acids, an organobromine compound, a cyclic ether, a diterpenoid and a dibenzooxepine.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC[C@@]1([C@H](CC=C2[C@H]1CC3=C(C=CC(=C3)C(=O)O)OC2)Br)C)/C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a monocarboxylic acid in which one of the alpha-hydrogens is substituted by a biphenyl-4-yl group. An active metabolite of fenbufen, it is used as a topical medicine to treat muscle inflammation and arthritis. It has a role as a non-steroidal anti-inflammatory drug. It is a member of biphenyls and a monocarboxylic acid. It contains a biphenyl-4-yl group. 
It derives from an acetic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C(C=C1)C2=CC=C(C=C2)CC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25775',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a naphthaldehyde that is naphthalene substituted by a formyl group at position 2. It has a role as a mouse metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC=C2C=C(C=CC2=C1)C=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22980',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a very long-chain omega-6 fatty acid that is tetracosanoic acid having five double bonds located at positions 6, 9, 12, 15 and 18 (the 6Z,9Z,12Z,15Z,18Z-isomer). It is an omega-6 fatty acid and a tetracosapentaenoic acid. It is a conjugate acid of a (6Z,9Z,12Z,15Z,18Z)-tetracosapentaenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7143',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a class of hydrolysable tannins obtained by condensation of the carboxy group of gallic acid (and its polymeric derivatives) with the hydroxy groups of a monosaccharide (most commonly glucose). 
It derives from a gallic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(C=C(C(=C1O)O)O)C(=O)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC(=O)C3=CC(=C(C(=C3)O)O)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12350',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a glucotriose consisting of three alpha-D-glucopyranose residues joined in sequence by (1->6) and (1->2) glycosidic bonds. It derives from an alpha-D-Glcp-(1->2)-alpha-D-Glcp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20548',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an alkene that is hex-1-ene substituted by methyl groups at positions 4 and 5 respectively. It has a role as a metabolite. It derives from a hydride of a 1-hexene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(C)C(C)CC=C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18982',\n", + " 'prompt': \"Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a member of the class of acridines that is 1,2,3,4-tetrahydroacridine substituted by an amino group at position 9. It is used in the treatment of Alzheimer's disease. It has a role as an EC 3.1.1.7 (acetylcholinesterase) inhibitor. 
It is a member of acridines and an aromatic amine.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1CCC2=NC3=CC=CC=C3C(=C2C1)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12214',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a cucurbitacin that is 9,10,14-trimethyl-4,9-cyclo-9,10-secocholesta-2,5,23-triene substituted by hydroxy groups at positions 2, 16, 20 and 25 and oxo groups at positions 1, 11 and 22. It has a role as a plant metabolite and an antineoplastic agent. It is a cucurbitacin and a tertiary alpha-hydroxy ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@]12C[C@H]([C@@H]([C@]1(CC(=O)[C@@]3([C@H]2CC=C4[C@H]3C=C(C(=O)C4(C)C)O)C)C)[C@](C)(C(=O)/C=C/C(C)(C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14711',\n", + " 'prompt': \"Render the natural language description of the molecule into the corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organic cation obtained by protonation of the two free amino groups of 2'-deamino-2'-hydroxyparomamine; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a 2'-deamino-2'-hydroxyparomamine.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@@H]([C@H]([C@@H]([C@H]1[NH3+])O[C@@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O)O)O)[NH3+]\\nThe natural language question is: The molecule is an aminoglycoside sulfate salt and an erythromycin derivative. It has a role as an enzyme inhibitor. 
It contains an erythromycin A 2'-propanoate.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCOS(=O)(=O)O.CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)OC)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)OC(=O)CC)(C)O)C)C)O)(C)O\\nThe natural language question is: The molecule is a member of the class of indoles that is methyl 2-(1H-indol-2-yl)prop-2-enoate in which the indole moiety has been substituted at position 3 by a 2-(5-ethylpyridin-1(2H)-yl)ethyl group. An intermediate in the biosynthesis of aspidosperma and iboga alkaloids. It is a terpenoid indole alkaloid, a methyl ester, a dihydropyridine, a member of indoles, an alkaloid ester and an enamine. It is a conjugate base of a dehydrosecodine(1+).\\nThe corresponding SMILES representation is:\\nCCC1=CN(CC=C1)CCC2=C(NC3=CC=CC=C32)C(=C)C(=O)OC\\nThe natural language question is: The molecule is a C19 straight-chain fatty acid of plant or bacterial origin. An intermediate in the biodegradation of n-icosane, it has been shown to inhibit cancer growth. It has a role as a fungal metabolite. It is a straight-chain saturated fatty acid and a long-chain fatty acid. It is a conjugate acid of a nonadecanoate.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCC(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an amino acid zwitterion, obtained by transfer of a proton from the carboxylic acid group to the amino group of (1R,6S)-6-amino-5-oxocyclohex-2-ene-1-carboxylic acid. It is an enantiomer of a (1S,6R)-6-ammonio-5-oxocyclohex-2-ene-1-carboxylate. 
It is a tautomer of a (1R,6S)-6-amino-5-oxocyclohex-2-ene-1-carboxylic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1C=C[C@H]([C@@H](C1=O)[NH3+])C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_119',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a cinnamate ester obtained by formal condensation of the carboxy group of trans-ferulic acid with one of the hydroxy groups of L-tartaric acid. It has a role as a metabolite. It derives from a ferulic acid and a L-tartaric acid. It is an enantiomer of a (2S,3S)-trans-fertaric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)/C=C/C(=O)O[C@H]([C@H](C(=O)O)O)C(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8917',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a jadomycin that is jadomycin A in which the phenolic hydroxy group at position 12 has been converted to the corresponding 2,6-dideoxy-alpha-L-ribo-hexopyranoside, isolated from Streptomyces venezuelae. It exists as a diastereoisomeric mixture consisting of both 3aS and 3aR isomers. It has a role as a bacterial metabolite, an apoptosis inducer, an Aurora kinase inhibitor, an antibacterial agent and an antineoplastic agent. It is a jadomycin, an organic heteropentacyclic compound and a glycoside. 
It derives from a jadomycin A.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC[C@H](C)[C@H]1C(=O)OC2N1C3=C(C4=C2C=C(C=C4O)C)C(=O)C5=C(C3=O)C(=CC=C5)O[C@H]6C[C@H]([C@H]([C@@H](O6)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_539',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a phosphatidylethanolamine 38:4 zwitterion obtained by transfer of a proton from the phosphate to the primary amino group of 1-octadecanoyl-2-(5Z,8Z,11Z,14Z-icosatetraenoyl)-sn-glycero-3-phosphoethanolamine; major species at pH 7.3. It is a tautomer of a 1-stearoyl-2-arachidonoyl-sn-glycero-3-phosphoethanolamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_17613',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a nucleotide-sugar oxoanion arising from deprotonation of the free diphosphate OH groups of dTDP-4-dehydro-6-deoxy-beta-D-gulose. 
It is a conjugate base of a dTDP-4-dehydro-6-deoxy-beta-D-gulose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C(=O)[C@H]([C@H]([C@@H](O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29008',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is the stable isotope of gold with relative atomic mass 196.966552, 100 atom percent natural abundance and nuclear spin 3/2.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[197Au]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25819',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an oligosaccharide phosphate corresponding to the oligosaccharide repeating unit of Leishmania major promastigote lipophosphoglycan. It has a role as an epitope. It is an oligosaccharide phosphate and a trisaccharide derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)O[C@H]2[C@H]([C@H](O[C@H]([C@@H]2O)O[C@@H]3[C@H](O[C@@H]([C@H]([C@H]3O)O)O)CO)COP(=O)(O)O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16645',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a benzenedicarboxamide compound having N-(2,3-dihydroxypropyl)carbamoyl groups at the 1- and 3-positions, iodo substituents at the 2-, 4- and 6-positions and an N-(2,3-dihydroxypropyl)acetamido group at the 5-position. It has a role as a radioopaque medium, an environmental contaminant and a xenobiotic. 
It is an organoiodine compound and a benzenedicarboxamide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N(CC(CO)O)C1=C(C(=C(C(=C1I)C(=O)NCC(CO)O)I)C(=O)NCC(CO)O)I'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19130',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a fatty acid ester obtained by formal condensation of the carboxy group of (9Z)-hexadecenoic acid with the hydroxy group of 12-hydroxyoctadecanoic acid. It is a fatty acid ester and a monocarboxylic acid. It derives from a palmitoleic acid and a 12-hydroxyoctadecanoic acid. It is a conjugate acid of a 12-[(9Z)-hexadecenoyloxy]octadecanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCC/C=C\\\\\\\\CCCCCCCC(=O)OC(CCCCCC)CCCCCCCCCCC(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24058',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a cholanic acid anion that is the conjugate base of ursocholic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of an ursocholic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)[O-])[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_2438',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a polyprenyl glycosyl phosphate consisting of beta-D-arabinofuranose attached at the 1-position to trans,octacis-decaprenyl phosphate. 
It is a conjugate acid of a trans,octacis-decaprenylphospho-beta-D-arabinofuranose(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)(O)O[C@H]1[C@H]([C@@H]([C@H](O1)CO)O)O)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23666',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a member of the class of phenylureas that is urea in which a hydrogen attached to one nitrogen is replaced by a cyclohexyl group and a hydrogen attached to the other nitrogen is replaced by a p-iodophenyl group. It is an organoiodine compound and a member of phenylureas.\\nThe corresponding SMILES representation is:\\nC1CCC(CC1)NC(=O)NC2=CC=C(C=C2)I\\nThe natural language question is: The molecule is an L-alpha-amino acid zwitterion that is L-alpha-aminobutyric acid in which a proton has been transferred from the carboxy group to the amino group. It is the major species at pH 7.3. It is a tautomer of a L-alpha-aminobutyric acid.\\nThe corresponding SMILES representation is:\\nCC[C@@H](C(=O)[O-])[NH3+]\\nThe natural language question is: The molecule is a trifluorinated corticosteroid that consists of 6alpha,9-difluoro-11beta,17alpha-dihydroxy-17beta-{[(fluoromethyl)sulfanyl]carbonyl}-16-methyl-3-oxoandrosta-1,4-diene bearing a propionyl substituent at position 17; has anti-inflammatory, anti-asthmatic and anti-allergic activity. It has a role as an anti-allergic agent, an anti-asthmatic drug, an anti-inflammatory drug, a dermatologic drug, a bronchodilator agent and an adrenergic agent. It is a corticosteroid, a steroid ester, an 11beta-hydroxy steroid, a propanoate ester, a fluorinated steroid, a thioester and a 3-oxo-Delta(1),Delta(4)-steroid. 
It derives from a fluticasone. It derives from a hydride of an androstane.\\nThe corresponding SMILES representation is:\\nCCC(=O)O[C@@]1([C@@H](C[C@@H]2[C@@]1(C[C@@H]([C@]3([C@H]2C[C@@H](C4=CC(=O)C=C[C@@]43C)F)F)O)C)C)C(=O)SCF\\nThe natural language question is: The molecule is lipid IVA glycosylated with two 3-deoxy-D-manno-octulosonic acid (KDO) residues. It has a role as an Escherichia coli metabolite. It is a conjugate acid of a (KDO)2-lipid IVA(6-).\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCC[C@H](CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)(O)O)CO[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO[C@@]3(C[C@H]([C@H]([C@H](O3)[C@@H](CO)O)O)O[C@@]4(C[C@H]([C@H]([C@H](O4)[C@@H](CO)O)O)O)C(=O)O)C(=O)O)OP(=O)(O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)NC(=O)C[C@@H](CCCCCCCCCCC)O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is an acyl-CoA having acetyl as its S-acetyl component. It has a role as an effector, a coenzyme, an acyl donor and a fundamental metabolite. It derives from an acetic acid and a coenzyme A. It is a conjugate acid of an acetyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9825',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of beta-carbolines that is 9H-beta-carboline substituted by a methoxy group at position 7 and methyl groups at positions 1 and 2. It is a semisynthetic derivative of harmine and has been shown to exhibit significant anti-HIV activity. It has a role as an anti-HIV agent. It is a member of beta-carbolines, an aromatic ether and a semisynthetic derivative. 
It derives from a harmine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C2C(=C3C=CC(=CC3=N2)OC)C=CN1C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14560',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an N-acyl-sn-glycero-3-phosphoethanolamine in which the N-acyl group is specified as (4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl. It derives from an all-cis-docosa-4,7,10,13,16,19-hexaenoic acid. It is a conjugate acid of a N-(4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoyl-sn-glycero-3-phosphoethanolamine(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)NCCOP(=O)(O)OC[C@@H](CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10128',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a steroid glucuronide anion that is the conjugate base of lithocholic acid 3-O-(beta-D-glucuronide) arising from deprotonation of the carboxylic acid functions; major species at pH 7.3. It is a steroid glucosiduronic acid anion, a beta-D-glucosiduronate and a dicarboxylic acid dianion. It is a conjugate base of a lithocholic acid 3-O-(beta-D-glucuronide).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=O)[O-])[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC[C@H]4[C@@]3(CC[C@H](C4)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)[O-])O)O)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3143',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is the para-isomer of ATTO 565 cation. It has a role as a fluorochrome. 
It is a xanthene dye, a dicarboxylic acid, an organic heteropentacyclic compound and an organic cation.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCN1CCCC2=CC3=C(C=C21)OC4=CC5=[N+](CCCC5=CC4=C3C6=C(C=CC(=C6)C(=O)O)C(=O)O)CC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_315',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a methyl ester resulting from the formal condensation of the carboxy group of halauxifen with methanol. It is a proherbicide used for the control of broad-leaved weeds in cereals and oilseed rape. It has a role as a synthetic auxin and a proherbicide. It is a member of monofluorobenzenes, a member of monochlorobenzenes, a monomethoxybenzene, an aminopyridine, a methyl ester, a chloropyridine and a biaryl. It derives from a halauxifen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1F)C2=NC(=C(C(=C2)N)Cl)C(=O)OC)Cl'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19853',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an organic cation that is the conjugate acid of (R)-adrenaline, obtained by protonation of the amino group; major species at pH 7.3. It has a role as a human metabolite. It is an organic cation and an ammonium ion derivative. 
It is a conjugate acid of a (R)-adrenaline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[NH2+]C[C@@H](C1=CC(=C(C=C1)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16251',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a carboxylic acid trianion resulting from the deprotonation of the phosphate group and both carboxy groups of 7,8-dihydromethanopterin. The major species at pH 7.3. It is a carboxylic acid trianion and an organophosphate oxoanion. It is a conjugate base of a 7,8-dihydromethanopterin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1C(=NC2=C(N1)N=C(NC2=O)N)[C@@H](C)NC3=CC=C(C=C3)C[C@@H]([C@@H]([C@@H](CO[C@@H]4[C@@H]([C@@H]([C@H](O4)COP(=O)([O-])O[C@@H](CCC(=O)[O-])C(=O)[O-])O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6318',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a member of the class of benzamides that is 4-(imidazol-2-yl)benzamide carrying additional 1,3-benzodioxol-5-yl and pyridin-2-yl substituents at positions 4 and 5 respectively on the imidazole ring. It has a role as an EC 2.7.10.1 (receptor protein-tyrosine kinase) inhibitor. 
It is a member of benzamides, a member of imidazoles, a member of pyridines and a member of benzodioxoles.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1OC2=C(O1)C=C(C=C2)C3=C(NC(=N3)C4=CC=C(C=C4)C(=O)N)C5=CC=CC=N5'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15536',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an (omega-1)-hydroxy fatty acid that is hexadecanoic acid in which the 15-pro-R hydrogen is replaced by a hydroxy group. It is an (omega-1)-hydroxy fatty acid and a long-chain fatty acid. It derives from a hexadecanoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCCCCCCCCCCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18121',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a fatty amide resulting from the formal condensation of the carboxy group of (Z)-dodec-2-enoic acid with the nitrogen of piperidine. It is a fatty amide, an enamide, a N-acylpiperidine and a tertiary carboxamide. It derives from a piperidine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCC/C=C\\\\\\\\C(=O)N1CCCCC1'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20971',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a D-fructofuranose 1,6-bisphosphate with a beta-configuration at the anomeric position. It has a role as a mouse metabolite. It derives from a beta-D-fructofuranose. 
It is a conjugate acid of a beta-D-fructofuranose 1,6-bisphosphate(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@](O1)(COP(=O)(O)O)O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11849',\n", + " 'prompt': \"Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an organonitrogen heterocyclic antibiotic that is 2,4'-bi-1,3-thiazole substituted by an isopropyl group at position 2' and a 3,5,7-trimethoxy-4-methyl-7-oxohepta-1,5-dien-1-yl group at position 4 (the 2E,4R,5S,6E stereoisomer). It is isolated from the culture broth of myxobacterium, Cystobacter fuscus, and exhibits antifungal and cytotoxic activity. It has a role as an antifungal agent, an antineoplastic agent and a bacterial metabolite. It is an organonitrogen heterocyclic antibiotic, a member of 1,3-thiazoles, an enol ether, an enoate ester, a biaryl and a methyl ester.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C[C@H]([C@H](/C=C/C1=CSC(=N1)C2=CSC(=N2)C(C)C)OC)/C(=C\\\\\\\\C(=O)OC)/OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13885',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a branched amino decasaccharide consisting of a hexasaccharide chain of beta-D-galactose, N-acetyl-beta-D-glucosamine, alpha-D-mannose, beta-D-mannose, and two N-acetyl-beta-D-glucosamine residues linked sequentially (1->4), (1->2), (1->3), (1->4) and (1->4), to the beta-D-mannose residue of which is (1->6)-linked a beta-D-galactosyl-(1->4)-N-acetyl-beta-D-glucosaminyl-(1->3)-alpha-D-mannosyl trisaccharide branch and to the reducing end N-acetyl-beta-D-glucosamine residue of which is (1->6)-linked an alpha-L-fucose residue. 
When it is the N-glycan content of the tumour necrosis factor (TNF) alpha blocker adalimumab, the two terminal galactose residues may be either absent or alternatively the linkage to the GlcNAc residues may be (1->3), while the fucose residue may be absent. It has a role as an epitope. It is an amino decasaccharide, a glucosamine oligosaccharide and a N-glycan derivative.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)O)NC(=O)C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O[C@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO)O)O)O)O)NC(=O)C)O)O[C@@H]8[C@H]([C@H]([C@@H]([C@H](O8)CO)O)O)O[C@H]9[C@@H]([C@H]([C@@H]([C@H](O9)CO)O[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)O)NC(=O)C)O)O)NC(=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23117',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an organic cation obtained by protonation of the amino function of pavine; major species at pH 7.3. It is an ammonium ion derivative and an organic cation. It is a conjugate acid of a pavine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=C2C3CC4=CC(=C(C=C4C([NH2+]3)CC2=C1)OC)OC)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11650',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a DiHDPA obtained by formal dihydroxylation of the 19,20-double bond of docosa-4,7,10,13,16,19-hexaenoic acid. 
It has a role as a metabolite.\\nThe corresponding SMILES representation is:\\nCCC(C(C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCC(=O)O)O)O\\nThe natural language question is: The molecule is an aromatic ketone that is aniline substituted at position 2 by a 3-aminopropanoyl group. It has a role as a metabolite. It is a substituted aniline, a primary amino compound and an aromatic ketone.\\nThe corresponding SMILES representation is:\\nC1=CC=C(C(=C1)C(=O)CCN)N\\nThe natural language question is: The molecule is a flavin adenine dinucleotide that is FADH2 bearing an N-sulfo substituent at position 5 of the flavin ring system.. It is a member of sulfamic acids and a flavin adenine dinucleotide. It derives from a FADH2.\\nThe corresponding SMILES representation is:\\nCC1=CC2=C(C=C1C)N(C3=C(N2C[C@@H]([C@@H]([C@@H](COP(=O)(O)OP(=O)(O)OC[C@@H]4[C@H]([C@H]([C@@H](O4)N5C=NC6=C(N=CN=C65)N)O)O)O)O)O)NC(=O)NC3=O)S(=O)(=O)O\\nThe natural language question is: The molecule is a 3alpha-hydroxy steroid, a 7alpha-hydroxy steroid, a 26-oxo steroid and a steroid aldehyde. It has a role as a human metabolite and a mouse metabolite. It derives from a hydride of a 5beta-cholestane.\\nThe corresponding SMILES representation is:\\nC[C@H](CCCC(C)C=O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 1-pyrrolinecarboxylate resulting from the removal of the proton from the carboxy group of 1-pyrroline-5-carboxylic acid. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. 
It is a conjugate base of a 1-pyrroline-5-carboxylic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CC(N=C1)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13697',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a 4-hydroxy monocarboxylic acid that is butyric acid in which one of the hydrogens at position 4 is replaced by a hydroxy group. It has a role as a general anaesthetic, a GHB receptor agonist, a sedative and a neurotoxin. It is a 4-hydroxy monocarboxylic acid and a hydroxybutyric acid. It derives from a butyric acid. It is a conjugate base of a 4-hydroxybutyrate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CC(=O)O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4750',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an organosulfonate oxoanion obtained by deprotonation of the sulfonic acid groups of naphthalene blue black CS (acid form). It is a conjugate base of a naphthalene blue black CS (acid form).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1N=NC2=C(C=C3C=C(C(=C(C3=C2N)O)N=NC4=CC=C(C=C4)S(=O)(=O)[O-])S(=O)(=O)[O-])S(=O)(=O)[O-])[N+](=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12794',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (5Z,8Z,11Z,14Z,17Z)-icosapentaenoic acid. It is a member of n-3 PUFA and by-product of alpha-linolenic acid metabolism. It is an unsaturated fatty acyl-CoA, a long-chain fatty acyl-CoA and a timnodonoyl bioconjugate. 
It is a conjugate acid of a (5Z,8Z,11Z,14Z,17Z)-icosapentaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_18383',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a 2-[(ethanesulfonyl)amino]-5-fluoro-4-[4-methyl-5-oxo-3-(trifluoromethyl)-4,5-dihydro-1H-1,2,4-triazol-1-yl]benzene-1-carbothioamide that has (S)-configuration. It has a role as a fungicide. It is an enantiomer of a (R)-fluoxapiprolin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CS(=O)(=O)OC1=C(C(=CC=C1)Cl)[C@@H]2CC(=NO2)C3=CSC(=N3)C4CCN(CC4)C(=O)CN5C(=CC(=N5)C(F)F)C(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16047',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a member of the class of xanthones that is xanthen-9-one substituted at position 4 by a nitro group and at positions 1 and 8 by hydroxy groups. It has a role as a protein kinase inhibitor. It is a member of xanthones, a polyphenol and a C-nitro compound.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C2C(=C1)OC3=C(C=CC(=C3C2=O)O)[N+](=O)[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3442',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a cholestanoid that is cholest-4-en-26-oic acid substituted by an alpha-hydroxy group at position 7 and an oxo group at position 3. It is an intermediate metabolite in the bile acid synthesis. 
It has a role as a human metabolite. It is a 3-oxo-Delta(4) steroid, a 7alpha-hydroxy steroid, a cholestanoid, a steroid acid and a monocarboxylic acid. It is a conjugate acid of a 7alpha-hydroxy-3-oxo-4-cholestenoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCCC(C)C(=O)O)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2[C@@H](CC4=CC(=O)CC[C@]34C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6194',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is dianion of UDP-N-acetyl-alpha-D-glucosamine arising from deprotonation of both free diphosphate OH groups; major species at pH 7.3. It has a role as a human metabolite and a Saccharomyces cerevisiae metabolite. It is a nucleotide-sugar oxoanion and an UDP-monosaccharide(2-). It is a conjugate base of an UDP-N-acetyl-alpha-D-glucosamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21416',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a dTDP-4-amino-4,6-dideoxy-D-galactose(1-) in which the anomeric centre of the pyranose fragment has alpha-configuration. 
It is a conjugate base of a dTDP-4-amino-4,6-dideoxy-alpha-D-galactose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O)O)[NH3+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27605',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a D-alpha-amino acid that is 6-chlorotryptophan in which the chiral centre has D- (R-) configuration. It is a D-tryptophan derivative, a 6-chlorotryptophan and a D-alpha-amino acid. It is a tautomer of a 6-chloro-D-tryptophan zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC2=C(C=C1Cl)NC=C2C[C@H](C(=O)O)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1248',\n", + " 'prompt': \"Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a methyluridine that consists of uridine bearing two methyl substituents located at position C-5 on the uracil ring and position O-2' on the ribose ring.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CC1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)CO)O)OC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_7048',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is an amino oligosaccharide (dodecasaccharide) consisting of two tetrasaccharide units, each consisting of two beta-D-galactose residues, one N-acetyl-beta-D-glucosamine residue and one alpha-L-mannose residue linked in sequence (1->4), (1->4) and (1->2), linked (1->2) and (1->6) to the mannose residue of an amino trisaccharide comprising beta-D-mannose, N-acetyl-beta-D-glucosamine and N-acetyl-D-glucosamine residues all linked (1->4), and to the 
reducing-end N-acetyl-D-glucosamine residue of which is also linked (1->6) an alpha-L-fucosyl residue. It is an amino oligosaccharide and a glucosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@H]([C@H]([C@@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H](C(O2)O)NC(=O)C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O[C@H]4[C@H]([C@H]([C@@H]([C@H](O4)CO[C@@H]5[C@H]([C@H]([C@@H]([C@H](O5)CO)O)O)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O[C@H]7[C@@H]([C@H]([C@H]([C@H](O7)CO)O[C@H]8[C@@H]([C@H]([C@H]([C@H](O8)CO)O)O)O)O)O)O)NC(=O)C)O)O[C@@H]9[C@H]([C@H]([C@@H]([C@H](O9)CO)O)O)O[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O[C@H]1[C@@H]([C@H]([C@H]([C@H](O1)CO)O)O)O)O)O)O)NC(=O)C)O)O)NC(=O)C)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_279',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a propanone that is acetone in which one of the methyl hydrogens is replaced by a hydroxy group. It has a role as a human metabolite, an Escherichia coli metabolite and a mouse metabolite. It is a member of propanones, a methyl ketone, a primary alcohol and a primary alpha-hydroxy ketone. It derives from an acetone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)CO'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_460',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an alpha-D-Galp-(1->3)-D-Manp in which the mannopyranose moiety has beta- configuration at the anomeric centre. 
It has a role as an epitope.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@@H]([C@@H](O1)O)O)O[C@@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13321',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is the penta-anion resulting from the removal of protons from each of the sulfonic acid groups of colistimethate B. It is a conjugate base of a colistimethate B.\\nThe corresponding SMILES representation is:\\nC[C@H]([C@H]1C(=O)NCC[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N1)CCNCS(=O)(=O)[O-])CCNCS(=O)(=O)[O-])CC(C)C)CC(C)C)CCNCS(=O)(=O)[O-])NC(=O)[C@H](CCNCS(=O)(=O)[O-])NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](CCNCS(=O)(=O)[O-])NC(=O)CCCCC(C)C)O\\nThe natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (21Z,24Z,27Z,30Z,33Z)-hexatriacontapentaenoic acid. It is an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. It derives from a (21Z,24Z,27Z,30Z,33Z)-hexatriacontapentaenoic acid. It is a conjugate acid of a (21Z,24Z,27Z,30Z,33Z)-hexatriacontapentaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\nCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O\\nThe natural language question is: The molecule is an 11,12-EET in which the epoxy moiety has 11S,12R-configuration. It has a role as a human xenobiotic metabolite. It is a conjugate acid of an (11S,12R)-EET(1-). 
It is an enantiomer of an (11R,12S)-EET.\\nThe corresponding SMILES representation is:\\nCCCCC/C=C\\\\\\\\C[C@@H]1[C@@H](O1)C/C=C\\\\\\\\C/C=C\\\\\\\\CCCC(=O)O\\nThe natural language question is: The molecule is a 3beta-sterol having the structure of desmosterol with an extra double bond at C-7--C-8. It has a role as a human metabolite and a mouse metabolite. It derives from a desmosterol.\\nThe corresponding SMILES representation is:\\nC[C@H](CCC=C(C)C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3C2=CC=C4[C@@]3(CC[C@@H](C4)O)C)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is a 14-membered macrolide which is isolated from Streptomyces sp.MK929-43F1 and inhibits cell migration of human esophageal cancer EC17 cells and mouse melanona B16 cells. It has a role as a metabolite and an antineoplastic agent. It is an ether, a secondary alcohol, a member of piperidones and a macrolide antibiotic.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1/C=C(\\\\\\\\[C@H](OC(=O)/C=C/CC/C=C/[C@@H]([C@H]1O)OC)[C@H](C)C(=O)CCCC2CC(=O)NC(=O)C2)/C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3296',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is the tetracyclic anthrafuran form of versicolorone. It is an anthrafuran, a versicolorone and a member of p-quinones.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)CCC1COC2=C1C(=C3C(=C2)C(=O)C4=C(C3=O)C(=CC(=C4)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28537',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a semisynthetic derivative obtained by alkaline hydrolysis of the macrocyclic lactone ring of brefeldin A. 
It is a secondary allylic alcohol, a triol, an alpha,beta-unsaturated monocarboxylic acid, a 4-hydroxy monocarboxylic acid, an alicyclic compound and a semisynthetic derivative. It derives from a brefeldin A.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H](CCC/C=C/[C@@H]1C[C@@H](C[C@H]1[C@@H](/C=C/C(=O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8134',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a polyprenyl glycosyl diphosphate having eleven prenyl units and with 4-O-[1-D-ribitylphosphono-(2R)-1-glycerylphosphono]-N-acetyl-beta-D-mannosaminyl-(1->4)-N-acetyl-alpha-D-glucosaminyl as the glycosyl component. It is a conjugate acid of a 4-O-[1-D-ribitylphosphonato-(2R)-1-glycerylphosphonato]-N-acetyl-beta-D-mannosaminyl-(1->4)-N-acetyl-alpha-D-glucosaminyl ditrans,octacis-undecaprenyl diphosphate(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC/C(=C/CC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)(O)OP(=O)(O)O[C@@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)OP(=O)(O)OC[C@@H](COP(=O)(O)OC[C@@H]([C@@H]([C@@H](CO)O)O)O)O)O)NC(=O)C)O)NC(=O)C)/C)/C)/C)/C)/C)/C)/C)/C)/C)/C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16666',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is dianion of UDP-alpha-D-xylose arising from deprotonation of both free diphosphate OH groups. It has a role as a human metabolite. It is a nucleotide-sugar oxoanion and an UDP-monosaccharide(2-). 
It is a conjugate base of an UDP-alpha-D-xylose.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H]([C@H]([C@H](O1)OP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25780',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a 1,2-diglyceride with lauroyl and oleoyl as the two acyl groups. It is a 1,2-diglyceride and a dodecanoate ester. It derives from an oleic acid.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC\\nThe natural language question is: The molecule is a one-carbon compound in which the carbon is joined only to a single oxygen. It is a colourless, odourless, tasteless, toxic gas. It has a role as a neurotoxin, a signalling molecule, a vasodilator agent, a neurotransmitter, a metabolite, a P450 inhibitor, a ligand, a biomarker, a probe, a human metabolite, a mouse metabolite, an EC 1.9.3.1 (cytochrome c oxidase) inhibitor and a mitochondrial respiratory-chain inhibitor. It is a one-carbon compound, a gas molecular entity and a carbon oxide. It is a conjugate base of a carbon monoxide(1+).\\nThe corresponding SMILES representation is:\\n[C-]#[O+]\\nThe natural language question is: The molecule is a 1-alkyl-sn-glycero-3-phosphocholine in which the alkyl group is specified as oleyl (9Z-octadecenyl). It is a 1-alkyl-sn-glycero-3-phosphocholine and a lysophosphatidylcholine O-18:1.\\nThe corresponding SMILES representation is:\\nCCCCCCCC/C=C\\\\\\\\CCCCCCCCOC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)O\\nThe natural language question is: The molecule is an ammonium ion derivative and a member of adamantanes. It is a conjugate acid of an amantadine. 
It derives from a hydride of an adamantane.\\nThe corresponding SMILES representation is:\\nC1C2CC3CC1CC(C2)(C3)[NH3+]\\nNext, you will be given a sample for test.The natural language question is: The molecule is a lipid A derivative in which the phospho group at the anomeric carbon is esterified with a 4-amino-4-deoxy-beta-L-arabinopyranosyl group. It is a conjugate acid of a lipid IIA(3-) and a lipid IIA(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCC[C@H](CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@@H]1OP(=O)(O)O[C@@H]2[C@@H]([C@H]([C@H](CO2)N)O)O)CO[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)OP(=O)(O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)NC(=O)C[C@@H](CCCCCCCCCCC)O)O)OC(=O)C[C@@H](CCCCCCCCCCC)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20872',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a 3-oxo-fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of 7-methyl-3-oxooct-6-enoic acid. It has a role as a mouse metabolite. It derives from a 7-methyl-3-oxooctanoic acid. It is a conjugate acid of a 7-methyl-3-oxooct-6-enoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1952',\n", + " 'prompt': \"Generate the SMILES code from the verbal description of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an iminium ion that is the cationic component of the histological dye 'ethyl violet'. 
It has a role as a fluorochrome and a histological dye.\\nThe corresponding SMILES representation is:\\nCCN(CC)C1=CC=C(C=C1)C(=C2C=CC(=[N+](CC)CC)C=C2)C3=CC=C(C=C3)N(CC)CC\\nThe natural language question is: The molecule is the conjugate base of 1,4,5,6-tetrahydro-6-oxonicotinic acid; major species at pH 7.3. It is a conjugate base of a 1,4,5,6-tetrahydro-6-oxonicotinic acid.\\nThe corresponding SMILES representation is:\\nC1CC(=O)NC=C1C(=O)[O-]\\nThe natural language question is: The molecule is a dihydroagarofuran sesquiterpenoid that consists of dihydro-beta-agarofuran substituted by acetoxy groups at positions 1 and 8 and furoyloxy groups at positions 6 and 9 (the 1beta,8beta,6alpha,9alpha stereoisomer). Isolated from Celastrus orbiculatus and exhibits inhibition of both NF-kappaB activation and nitric oxide production. It has a role as a metabolite and a NF-kappaB inhibitor. It is an acetate ester, a bridged compound, a cyclic ether, a dihydroagarofuran sesquiterpenoid and an organic heterotricyclic compound. It derives from a 3-furoic acid.\\nThe corresponding SMILES representation is:\\nC[C@@H]1CC[C@@H]([C@@]2([C@]13[C@@H]([C@@H]([C@H]([C@@H]2OC(=O)C4=COC=C4)OC(=O)C)C(O3)(C)C)OC(=O)C5=COC=C5)C)OC(=O)C\\nThe natural language question is: The molecule is a docosenoic acid having a cis-double bond at position 11. It has a role as a Daphnia tenebrosa metabolite. It is a conjugate acid of a cetoleate.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCC/C=C\\\\\\\\CCCCCCCCCC(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a nucleotide-sugar oxoanion arising from deprotonation of the diphosphate OH groups of 3''-O-butanoyl-ADP-D-ribose; major species at pH 7.3. 
It derives from an ADP-D-ribose(2-).\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'CCC(=O)O[C@@H]1[C@H](OC([C@@H]1O)O)COP(=O)([O-])OP(=O)([O-])OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6965',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an amino trisaccharide that is 2-acetamido-2-deoxy-beta-D-glucopyranose in which the hydroxy groups at positions 3 and 4 have been converted to the corresponding beta-D-galactopyranosyl and 4,6-dideoxy-alpha-L-xylo-hexopyranosyl derivatives, respectively. It is a member of acetamides and an amino trisaccharide. It derives from a beta-D-Galp-(1->3)-beta-D-GlcpNAc.\\nThe corresponding SMILES representation is:\\nC[C@H]1C[C@H]([C@@H]([C@@H](O1)O[C@@H]2[C@H](O[C@H]([C@@H]([C@H]2O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)NC(=O)C)O)CO)O)O\\nThe natural language question is: The molecule is a long-chain alkane consisting of an unbranched chain of 44 carbon atoms. It has a role as a human metabolite.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\\nThe natural language question is: The molecule is a cyclodepsipeptide isolated from the marine sponge Sidonops microspinosa and has been shown to exhibit anti-HIV-1 activity. It has a role as a metabolite and an anti-HIV-1 agent. 
It is a cyclodepsipeptide and an organosulfonic acid.\\nThe corresponding SMILES representation is:\\nCCC(C)(C)[C@@H](C(=O)N[C@H](CC1=CNC2=CC=CC=C21)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@H](CS(=O)(=O)O)C(=O)N[C@H]3[C@H](OC(=O)[C@H](NC(=O)[C@@H]4CCCN4C(=O)[C@H](NC(=O)[C@@H](N(C3=O)C)CCC(=O)N)C(C)C)CC(=O)O)C)NC(=O)[C@@H](C(C)(C)C)NC(=O)[C@@H]5CCCN5C(=O)C(C(C6=CC=C(C=C6)Br)O)NC(=O)[C@@H](C)NC=O\\nThe natural language question is: The molecule is a C79 mycolic acid having a C53 meromycolic chain with two cis cyclopropyl functions and a saturated C26 alpha-branch. It is produced by Mycobacterium tuberculosis H37Ra. It has a role as a bacterial metabolite. It is a mycolic acid and a hydroxy fatty acid. It is a conjugate acid of a (2R)-2-[(1R)-1-hydroxy-18-{2-[10-(2-nonadecylcyclopropyl)decyl]cyclopropyl}octadecyl]hexacosanoate.\\nThe corresponding SMILES representation is:\\nCCCCCCCCCCCCCCCCCCCCCCCC[C@H]([C@@H](CCCCCCCCCCCCCCCCCC1CC1CCCCCCCCCCC2CC2CCCCCCCCCCCCCCCCCCC)O)C(=O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a steroid acid that is 9beta,19-cyclo-5alpha-ergost-24(28)-ene-4alpha-carboxylic acid which is substituted by hydroxy groups at the 3beta, 8alpha, and 11alpha positions, by a methyl group at the 4beta position, and by oxo groups at positions 1 and 6. It is a cholesterol and ergosterol synthesis inhibitor isolated from the fungus Sporormiella minima that specifically targets the enzyme, sterol-4-alpha-carboxylate-3-dehydrogenase, encoded by ERG26 in budding yeast, and NSDHL in humans. It has a role as an EC 1.1.1.170 [3beta-hydroxysteroid-4alpha-carboxylate 3-dehydrogenase (decarboxylating)] inhibitor, a fungal metabolite, an antifungal agent and an ergosterol biosynthesis inhibitor. 
It is a 3beta-hydroxy steroid, a 6-oxo steroid, an 11alpha-hydroxy steroid, a member of cyclopropanes, a steroid acid and an 8-hydroxy steroid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CCC(=C)C(C)C)[C@H]1CC[C@@H]2[C@@]1(C[C@H]([C@]34[C@@]2(CC(=O)[C@@H]5[C@]3(C4)C(=O)C[C@@H]([C@@]5(C)C(=O)O)O)O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_248',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a 1-acyl-sn-glycero-3-phospho-1D-myo-inositol in which the 1-acyl group is specified as linoleoyl. It derives from a linoleic acid. It is a conjugate acid of a 1-linoleoyl-sn-glycero-3-phospho-D-myo-inositol(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCC(=O)OC[C@H](COP(=O)(O)OC1[C@@H]([C@H](C([C@H]([C@H]1O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26616',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an organosulfonate salt composed from 2-methoxy-4-nitrobenzene-1-diazonium and 5-sulfonaphthalene-1-sulfonate in a 1:1 ratio. Used for demostrating enterochromaffin in carcinoid tumours. It has a role as a histological dye. 
It contains a 2-methoxy-4-nitrobenzenediazonium.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'COC1=C(C=CC(=C1)[N+](=O)[O-])[N+]#N.C1=CC2=C(C=CC=C2S(=O)(=O)[O-])C(=C1)S(=O)(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5302',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a member of the class of 1,3-thiazoles that is 4-(1,3-thiazol-2-yl)piperidine in which the piperidine amino group is substituted by a [3,5-bis(difluoromethyl)-1H-pyrazol-1-yl]acetyl group and position 4 of the thiazole ring is substituted by a 5-{2-chloro-6-[(methylsulfonyl)oxy]phenyl}-4,5-dihydro-1,2-oxazol-3-yl group. It is an organofluorine compound, a member of 1,3-thiazoles, a N-acylpiperidine, an isoxazoline, a member of pyrazoles, a tertiary carboxamide, a member of monochlorobenzenes and a methanesulfonate ester.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CS(=O)(=O)OC1=C(C(=CC=C1)Cl)C2CC(=NO2)C3=CSC(=N3)C4CCN(CC4)C(=O)CN5C(=CC(=N5)C(F)F)C(F)F'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5211',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a trisaccharide that is beta-D-galactopyranose in which the hydroxy groups at positions 2 and 4 have been converted to the corresponding beta-D-galactopyranoside and beta-D-glucopyranoside, respectively. It is a trisaccharide, a beta-D-glucoside and a beta-D-galactoside. 
It derives from a beta-D-Galp-(1->2)-beta-D-Galp.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@@H]([C@H]([C@@H](O1)O[C@H]2[C@H](O[C@H]([C@@H]([C@H]2O)O[C@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_23576',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a tetracyclic diterpenoid that has an tetradecahydro-8,11a-methanocyclohepta[a]naphthalene skeleton with two hydroxymethyl substituents at positions 4 and 9, two methyl substituents at positions 4 and 11b and two hydroxy substituents at positions 3 and 9. An antibiotic with antiviral and antimitotical properties. Aphidicolin is a reversible inhibitor of eukaryotic nuclear DNA replication. It has a role as an antimicrobial agent, an antiviral drug, an antineoplastic agent, an EC 2.7.7.7 (DNA-directed DNA polymerase) inhibitor, a DNA synthesis inhibitor, an apoptosis inducer and a fungal metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@]12CC[C@H]([C@@]([C@@H]1CC[C@@H]3[C@@]24CC[C@@]([C@H](C3)C4)(CO)O)(C)CO)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13462',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a 3-hydroxy monocarboxylic acid that is propionic acid in which one of the hydrogens attached to the terminal carbon is replaced by a hydroxy group. It has a role as an Escherichia coli metabolite and a human metabolite. It is a 3-hydroxy monocarboxylic acid and an omega-hydroxy fatty acid. It derives from a propionic acid. 
It is a conjugate acid of a 3-hydroxypropionate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CO)C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16640',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is an amino trisaccharide in which two galactose residues, linked alpha(1->4), are linked beta(1->3) to an N-acetylgalactosamine residue. It is an amino trisaccharide and a galactosamine oligosaccharide.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1O)CO)O)O[C@H]2[C@@H]([C@H]([C@H]([C@H](O2)CO)O[C@@H]3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15351',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a diterpene alkaloid with formula C25H41NO9 that is isolated from several Aconitum species. It has a role as a plant metabolite, a human urinary metabolite, a NF-kappaB inhibitor and a xenobiotic. It is a bridged compound, a diterpene alkaloid, an organic heteropolycyclic compound, a polyether, a tertiary amino compound, a pentol, a secondary alcohol and a tertiary alcohol. It derives from a hydride of an aconitane.\\nThe corresponding SMILES representation is:\\nCCN1C[C@@]2([C@@H](C[C@@H]([C@@]34[C@@H]2[C@H]([C@@H](C31)[C@@]5([C@@H]6[C@H]4C[C@@]([C@@H]6O)([C@H]([C@@H]5O)OC)O)O)OC)OC)O)COC\\nThe natural language question is: The molecule is a member of the class of pterocarpans that is (6aR,11aR)-pterocarpan substituted by a hydroxy group at position 9, methoxy groups at position 1 and 3 and a prenyl group at position 2. It has been isolated from Glycyrrhiza uralensis. It has a role as a plant metabolite. 
It is a member of phenols and an aromatic ether. It derives from a (6aR,11aR)-pterocarpan.\\nThe corresponding SMILES representation is:\\nCC(=CCC1=C(C=C2C(=C1OC)[C@H]3[C@@H](CO2)C4=C(O3)C=C(C=C4)O)OC)C\\nThe natural language question is: The molecule is a benzenetriol that is benzophenone in which one of the phenyl groups is substituted at by hydroxy groups at positions 2, 4, and 6. It is a benzenetriol and a hydroxybenzophenone. It is a conjugate acid of a 2,4,6-trihydroxybenzophenone(1-).\\nThe corresponding SMILES representation is:\\nC1=CC=C(C=C1)C(=O)C2=C(C=C(C=C2O)O)O\\nThe natural language question is: The molecule is a razoxane. It has a role as a chelator, an antineoplastic agent, a cardiovascular drug and an immunosuppressive agent.\\nThe corresponding SMILES representation is:\\nC[C@@H](CN1CC(=O)NC(=O)C1)N2CC(=O)NC(=O)C2\\nNext, you will be given a sample for test.The natural language question is: The molecule is a phenolate anion obtained by deprotonation of the 4-hydroxy group of deoxyherqueinone. It is the major microspecies at pH 7.3. It has a role as an antibacterial agent and a fungal metabolite. It is a conjugate base of a deoxyherqueinone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]1C(C2=C(C3=C4C(=C2O1)C(=CC(=O)C4=C(C(=C3O)OC)O)C)[O-])(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13380',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a hydrocarbyl anion. It is a conjugate base of a methane. It is a conjugate acid of a methanediide.\\nThe corresponding SMILES representation is:\\n[CH3-]\\nThe natural language question is: The molecule is an acyl-CoA(4-) obtained by deprotonation of the phosphate and diphosphate OH groups of L-firefly luciferyl-CoA; major species at pH 7.3. 
It is a conjugate base of a L-firefly luciferyl-CoA.\\nThe corresponding SMILES representation is:\\nCC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])[C@H](C(=O)NCCC(=O)NCCSC(=O)[C@@H]4CSC(=N4)C5=NC6=C(S5)C=C(C=C6)O)O\\nThe natural language question is: The molecule is an acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of oscr#38. It derives from an oscr#38. It is a conjugate acid of an oscr#38-CoA(4-).\\nThe corresponding SMILES representation is:\\nC[C@H]1[C@@H](C[C@H]([C@@H](O1)OCCCCCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O)O)O\\nThe natural language question is: The molecule is a dihydropterin that is 2-amino-6-hydroxymethyl-7,8-dihydropteridin-4-one with two methyl substituents at position 7. It has a role as a metabolite. It derives from a 2-amino-6-(hydroxymethyl)-7,8-dihydropteridin-4-one.\\nThe corresponding SMILES representation is:\\nCC1(C(=NC2=C(N1)N=C(NC2=O)N)CO)C\\nNext, you will be given a sample for test.The natural language question is: The molecule is an indolyl carbohydrate that is the alpha-D-glucuronide of indoxyl in which the indole moiety is substituted at positions 4 and 5 by chlorine and bromine, respectively. It is an organochlorine compound, an organobromine compound, an indolyl carbohydrate, an alpha-D-glucosiduronic acid and a monosaccharide derivative. 
It derives from an indoxyl.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=C(C2=C1NC=C2O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)O)O)O)O)Cl)Br'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14634',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a member of the class of pyrrolidin-2-ones that is pyrrolidin-2-one in which the 3-pro-S-hydrogen is substituted by a 2-methoxy-2-oxoethyl group, while the 5-pro-S-hydrogen is substituted by a ({4'-[N-(methoxycarbonyl)carbamimidoyl]biphenyl-4-yl}oxy)methyl group. It is an orally active prodrug of fradafiban, a figrinogen receptor antagonist. It has a role as a prodrug and a platelet glycoprotein-IIb/IIIa receptor antagonist. It is a member of pyrrolidin-2-ones and a methyl ester. It derives from a fradafiban.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC(=O)C[C@@H]1C[C@H](NC1=O)COC2=CC=C(C=C2)C3=CC=C(C=C3)/C(=N/C(=O)OC)/N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13940',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a polyunsaturated fatty acyl-CoA(4-) arising from deprotonation of the phosphate and diphosphate functions of (19Z,22Z,25Z,28Z)-tetratriacontatetraenoyl-CoA. It is a polyunsaturated fatty acyl-CoA(4-), a very long-chain acyl-CoA(4-) and a 3-substituted propionyl-CoA(4-). 
It is a conjugate base of a (19Z,22Z,25Z,28Z)-tetratriacontatetraenoyl-CoA.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6682',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a glycopeptide consisting of the Ara6 epitope attached to an Ala-Asn-Ser-Ser-Phe-Ala-Pro-NH2 heptapeptide via a [(2-hydroxyethoxy)imino]acetyl linker. It contains a beta-D-Araf-(1->2)-alpha-D-Araf-(1->5)-[beta-D-Araf-(1->2)-alpha-D-Araf-(1->3)]-alpha-D-Araf-(1->5)-alpha-D-Araf-yl group.\\nThe corresponding SMILES representation is:\\nC[C@@H](C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CO)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H](C)C(=O)N2CCC[C@H]2C(=O)N)NC(=O)/C=N/OCCO[C@@H]3[C@H]([C@@H]([C@H](O3)CO[C@@H]4[C@H]([C@@H]([C@H](O4)CO[C@@H]5[C@H]([C@@H]([C@H](O5)CO)O)O[C@H]6[C@H]([C@@H]([C@H](O6)CO)O)O)O[C@@H]7[C@H]([C@@H]([C@H](O7)CO)O)O[C@H]8[C@H]([C@@H]([C@H](O8)CO)O)O)O)O)O\\nThe natural language question is: The molecule is a (6E)-7-[3-(4-fluorophenyl)-1-(propan-2-yl)-1H-indol-2-yl]-3,5-dihydroxyhept-6-enoic acid diastereoisomer in which both chiral centres have S configuration. 
It is an enantiomer of a (3R,5R,6E)-7-[3-(4-fluorophenyl)-1-(propan-2-yl)-1H-indol-2-yl]-3,5-dihydroxyhept-6-enoic acid.\\nThe corresponding SMILES representation is:\\nCC(C)N1C2=CC=CC=C2C(=C1/C=C/[C@H](C[C@@H](CC(=O)O)O)O)C3=CC=C(C=C3)F\\nThe natural language question is: The molecule is needed for the new reaction: trans,octacis-decaprenyl phosphate + 5-phospho-alpha-D-ribose 1-diphosphate = trans,octacis-decaprenylphospho-beta-D-ribofuranose 5-phosphate + diphosphate It is a conjugate base of a trans,octacis-decaprenylphospho-beta-D-ribofuranose 5-phosphate.\\nThe corresponding SMILES representation is:\\nCC(=CCC/C(=C/CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\CC/C(=C\\\\\\\\COP(=O)([O-])O[C@H]1[C@@H]([C@@H]([C@H](O1)COP(=O)([O-])[O-])O)O)/C)/C)/C)/C)/C)/C)/C)/C)/C)C\\nThe natural language question is: The molecule is a quinolone that is pefloxacin in which the ethyl group at position 1 of the quinolone has been replaced by a p-fluorophenyl group. A broad-spectrum antibiotic effective against both Gram-positive and Gram-negative bacteria, it is used (usually as the monohydrochloride salt) for the treatment of bacterial infections in dogs. It has a role as an antibacterial drug and a Mycoplasma genitalium metabolite. It is a quinolone, a N-alkylpiperazine, a N-arylpiperazine, a quinolone antibiotic, a fluoroquinolone antibiotic, a member of monofluorobenzenes and a monocarboxylic acid.\\nThe corresponding SMILES representation is:\\nCN1CCN(CC1)C2=C(C=C3C(=C2)N(C=C(C3=O)C(=O)O)C4=CC=C(C=C4)F)F\\nNext, you will be given a sample for test.The natural language question is: The molecule is the conjugate base of 3'-O-methyltricetin arising from selective deprotonation of the 7-OH position; major species at pH 7.3. 
It is a conjugate base of a 3'-O-methyltricetin.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'COC1=CC(=CC(=C1[O-])O)C2=CC(=O)C3=C(C=C(C=C3O2)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13921',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is an oxopurine that is xanthine in which the hydrogen attached to the nitrogen at position 7 is replaced by a methyl group. It is an intermediate metabolite in the synthesis of caffeine. It has a role as a plant metabolite, a human xenobiotic metabolite and a mouse metabolite. It is an oxopurine and a purine alkaloid. It derives from a 7H-xanthine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN1C=NC2=C1C(=O)NC(=O)N2'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_10936',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is the simplest member of the class of neoflavans, that is 3,4-dihydro-2H-1-benzopyran substituted by a phenyl group at position 4.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1COC2=CC=CC=C2C1C3=CC=CC=C3'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6141',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a chromenol that is a derivative of filicinic acid and is isolated from the stems and leaves of Hypericum drummondii. It has been found to exhibit antibacterial activity against Gram-positive bacteria Staphylococcus aureus and Bacillus subtilis and the acid fast bacterium Mycobacterium smegmatis. It has a role as a metabolite and an antibacterial agent. It is a chromenol, an enol, an enone, a methyl ketone and an aromatic ketone. 
It derives from a filicinic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CCC1(C(=C(C(=C(C1=O)C(=O)C)O)CC2=C3C(=C(C(=C2O)C(=O)C)O)C=CC(O3)(C)C)O)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15607',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a penicillin where the side-chain N-acyl group is specified as allylmercaptoacetyl. Antibiotic isolated from Penicillium chrysogenum. It has a role as a Penicillium metabolite.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CSCC=C)C(=O)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_8576',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a glycinamide ribonucleotide having a phosphate group at the 5-position and a formyl group on the glycine nitrogen. It is a conjugate acid of a N(2)-formyl-N(1)-(5-phospho-D-ribosyl)glycinamide(2-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]1[C@H]([C@H](C(O1)NC(=O)CNC=O)O)O)OP(=O)(O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_28564',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is an unsaturated fatty acyl-CoA that results from the formal condensation of the thiol group of coenzyme A with the carboxy group of (20Z,23Z,26Z,29Z,32Z)-octatriacontapentaenoic acid. It is an unsaturated fatty acyl-CoA and an ultra-long-chain fatty acyl-CoA. It derives from a (20Z,23Z,26Z,29Z,32Z)-octatriacontapentaenoic acid. 
It is a conjugate acid of a (20Z,23Z,26Z,29Z,32Z)-octatriacontapentaenoyl-CoA(4-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCCCCCCCCCCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21184',\n", + " 'prompt': 'Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is a tetracarboxylic acid anion that is the conjugate base of gadoteric acid. It is a conjugate base of a gadoteric acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1CN(CCN(CCN(CCN1CC(=O)[O-])CC(=O)[O-])CC(=O)[O-])CC(=O)[O-].[Gd+3]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9238',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a member of the class of pyranopyrroles with formula C17H21NO4, originally isolated from Aspergillus niger. It has a role as an Aspergillus metabolite and a marine metabolite. It is a gamma-lactam, an enol, a pyranopyrrole and a cyclic ketone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C/C=C/C1=C(C(=O)C2=C(O1)C(NC2=O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20806',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is an organochlorine pesticide having a 3,6-dichlorinated 4-aminopicolinic acid structure. It has a role as a herbicide. It is an organochlorine pesticide, a member of pyridines and an aromatic amine. 
It derives from a picolinic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(C(=C(N=C1Cl)C(=O)O)Cl)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24450',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a oxa fatty acid anion and the conjugate base of colnelenic acid, arising from deprotonation of the carboxylic acid group. It is a straight-chain fatty acid anion, a long-chain fatty acid anion, an oxa fatty acid anion and a polyunsaturated fatty acid anion. It is a conjugate base of a colnelenic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\O/C=C/CCCCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14554',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is the spermidine amide of glutathione. It has a role as an Escherichia coli metabolite. It derives from a spermidine. It is a conjugate base of a glutathionylspermidinium(2+).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C(CCNCCCNC(=O)CNC(=O)[C@H](CS)NC(=O)CC[C@@H](C(=O)O)N)CN'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9776',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a resin glycoside that is the pentasaccharide derivative of jalapinolic acid. It has been isolated from Calystegia soldanella. It has a role as a metabolite. It is a macrocyclic lactone, a resin glycoside and a pentasaccharide derivative. 
It derives from a tiglic acid and a jalapinolic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC[C@H]1CCCCCCCCCC(=O)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC[C@@H]3[C@H]([C@@H]([C@H]([C@@H](O3)O[C@H]4[C@@H]([C@H](O[C@H]([C@@H]4O[C@H]5[C@@H]([C@@H]([C@H]([C@@H](O5)C)OC(=O)/C(=C/C)/C)O)OC(=O)[C@@H](C)[C@H](C)O)O[C@@H]6[C@H]([C@@H]([C@H](O[C@H]6O1)C)O)O)CO)O)O)OC(=O)[C@@H](C)CC)OC(=O)[C@@H](C)CC)CO)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1343',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is an organophosphate oxoanion arising from deprotonation of the phosphate OH groups of D-erythro-1-(imidazol-4-yl)glycerol 3-phosphate; major species at pH 7.3. It has a role as a Saccharomyces cerevisiae metabolite. It is a conjugate base of a D-erythro-1-(imidazol-4-yl)glycerol 3-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=C(NC=N1)[C@@H]([C@@H](COP(=O)([O-])[O-])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_9152',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a HETE having a 5-hydroxy group and (6E)-, (8Z)-, (11Z)- and (14Z)-double bonds. It has a role as a mouse metabolite. It is a conjugate acid of a 5-HETE(1-).\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C=C\\\\\\\\C(CCCC(=O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12510',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a pentacyclic triterpenoid that is hopane in which a hydrogen attached to C-29 is replaced by a 2-hydroxyethyl group and in which the stereochemistry at C-17 is inverted to 17alphaH. 
It is a pentacyclic triterpenoid and a primary alcohol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCCO)[C@H]1CC[C@]2([C@@H]1CC[C@@]3([C@@H]2CC[C@H]4[C@]3(CC[C@@H]5[C@@]4(CCCC5(C)C)C)C)C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14847',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a pyrimidine derivative having amino substituents in the 2- and 6-positions, a hydroxy substituent at the 4-position and an N-methylformamido group at the 5-position. It is an aminopyrimidine, a formamidopyrimidine and a hydroxypyrimidine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CN(C=O)C1=C(N=C(NC1=O)N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26759',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a sesquiterpene that is 2-methylcyclohexa-1,3-diene in which a hydrogen at the 5 position is substituted (R configuration) by a 6-methyl-hept-5-en-2-yl group (R configuration). 7-Epizingiberene is a specific sesquiterpene with toxic and repellent properties that is produced and stored in glandular trichomes. It has a role as an insect repellent and a semiochemical. It is a sesquiterpene and a cycloalkene.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=CC[C@@H](C=C1)[C@H](C)CCC=C(C)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_11689',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a zwtterion obtained by transfer of a proton from the 2-hydroxy to the primary amino group of 4-amino-4-de(dimethylamino)anhydrotetracycline. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). 
It is a tautomer of a 4-amino-4-de(dimethylamino)anhydrotetracycline.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC1=C2C=CC=C(C2=C(C3=C1C[C@H]4[C@@H](C(=O)C(=C([C@]4(C3=O)O)[O-])C(=O)N)[NH3+])O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_26026',\n", + " 'prompt': 'Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.The natural language question is: The molecule is a quinone imine having amino substituents in the 2- and 5-positions and 4-aminophenyl substituents on both of the imine nitrogens. It is a trimer formed from 1,4-phenylenediamine. It has a role as a mutagen and an allergen. It derives from a 1,4-phenylenediamine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1N)N=C2C=C(C(=NC3=CC=C(C=C3)N)C=C2N)N'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_22094',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a cytochalasan alkaloid found in Chaetomium globosum. It has a role as a Chaetomium metabolite. It is a cytochalasan alkaloid, a member of indoles and a macrocycle.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]\\\\\\\\1C/C=C/[C@H]2[C@@H](C(=C)[C@H]([C@@H]3[C@@]2(C(=O)CCC(=O)C(=O)/C(=C1)/C)C(=O)N[C@H]3CC4=CNC5=CC=CC=C54)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_20675',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a branched-chain hydroxy fatty acid consisting of heptanoic acid with an isopropenyl group at the 3-position and the hydroxy group at the 6-position. It is a hydroxy fatty acid, a medium-chain fatty acid and a branched-chain fatty acid. 
It is a conjugate acid of a 6-hydroxy-3-isopropenylheptanoate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(CCC(CC(=O)O)C(=C)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_5545',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is a gallate ester resulting from the formal condensation of gallic acid with the (5R)-hydroxy group of (-)-quinic acid (i.e. the hydroxy group on the same side of the cyclohexane ring as the carboxy group). It is a gallate ester, a monocarboxylic acid and a tertiary alcohol. It derives from a gallic acid and a (-)-quinic acid.\\nThe corresponding SMILES representation is:\\nC1[C@H]([C@H]([C@@H](C[C@@]1(C(=O)O)O)OC(=O)C2=CC(=C(C(=C2)O)O)O)O)O\\nThe natural language question is: The molecule is a beta-hydroxy ketone that is butan-2-one substituted by a hydroxy group at position 4. It is a beta-hydroxy ketone and a methyl ketone. It derives from a butan-2-one.\\nThe corresponding SMILES representation is:\\nCC(=O)CCO\\nThe natural language question is: The molecule is a beta-bitter acid(1-) that is the conjugate base of colupulone, obtained by deprotonation of one of the enolic hydroxy groups. It is the major microspecies at pH 7.3 (according to Marvin v 6.2.0.). It is a conjugate base of a colupulone.\\nThe corresponding SMILES representation is:\\nCC(C)C(=O)C1=C(C(=C(C(C1=O)(CC=C(C)C)CC=C(C)C)O)CC=C(C)C)[O-]\\nThe natural language question is: The molecule is a UDP-N-acetyl-D-galactosamine 4-sulfate in which the anomeric centre of the galactosamine fragment has alpha-configuration. 
It is a conjugate acid of an UDP-N-acetyl-alpha-D-galactosamine 4-sulfate(3-).\\nThe corresponding SMILES representation is:\\nCC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H]([C@H]([C@@H](O2)N3C=CC(=O)NC3=O)O)O)CO)OS(=O)(=O)O)O\\nNext, you will be given a sample for test.The natural language question is: The molecule is a phosphatidylcholine 28:1 in which the acyl groups specified at positions 1 and 2 are decanoyl and (9Z-octadecenoyl respectively. It is a phosphatidylcholine 28:1 and a decanoate ester. It derives from an oleic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(=O)CCCCCCC/C=C\\\\\\\\CCCCCCCC'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3126',\n", + " 'prompt': \"Transform the verbal description of the molecule into its SMILES representation.The natural language question is: The molecule is an apo carotenoid compound arising from oxidative degradation of the beta,beta-carotene skeleton at the 8'- and 13-positions. It is a dialdehyde, an enal and an apo carotenoid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C/C(=C\\\\\\\\C=C\\\\\\\\C=O)/C=C/C=C(\\\\\\\\C)/C=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_27673',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a homodetic cyclic peptide composed from two units of three glycyl and three N(5)-acetyl-N(5)-hydroxy-L-ornithyl residues. It has a role as a siderophore. 
It is a homodetic cyclic peptide and a macrocycle.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=O)N(CCC[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)NCC(=O)NCC(=O)NCC(=O)N1)CCCN(C(=O)C)O)CCCN(C(=O)C)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16370',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is a gibberellin carboxylic acid anion that is the conjugate base of gibberellin A9, obtained by deprotonation of the carboxy group. It is a conjugate base of a gibberellin A9.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC12CCC[C@@]3([C@@H]1[C@@H]([C@]45[C@H]3CC[C@H](C4)C(=C)C5)C(=O)[O-])OC2=O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_21307',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is the conjugate base of sphingosine 1-phosphate having an anionic phosphate group and a protonated amino group; major species at pH 7.3. It has a role as a human metabolite. It is a conjugate base of a sphingosine 1-phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CCCCCCCCCCCCC/C=C/[C@H]([C@H](COP(=O)([O-])[O-])[NH3+])O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_1407',\n", + " 'prompt': 'Obtain the SMILES representation by processing the natural language description of the molecule.The natural language question is: The molecule is a C21-steroid in which a pregnane skeleton carries a beta-hydroxy group at position 3, an alpha-hydroxy group at position 20 and a double bond between positions 4 and 5. It is a C21-steroid, a 3beta-hydroxy steroid and a 20-hydroxy steroid. 
It derives from a progesterone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@@H]([C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CCC4=C[C@H](CC[C@]34C)O)C)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13408',\n", + " 'prompt': 'Generate the SMILES (Simplified Molecular Input Line Entry System) representation from the natural language description of the molecular.The natural language question is: The molecule is a stilbenoid that is the (+)-trans-stereoisomer of epsilon-viniferin, obtained by cyclodimerisation of trans-resveratrol. It is a member of 1-benzofurans, a polyphenol and a stilbenoid. It derives from a trans-resveratrol. It is an enantiomer of a (-)-trans-epsilon-viniferin.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1/C=C/C2=C3[C@@H]([C@H](OC3=CC(=C2)O)C4=CC=C(C=C4)O)C5=CC(=CC(=C5)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_12167',\n", + " 'prompt': 'Convert the natural language description of the molecule into its corresponding SMILES representation.The natural language question is: The molecule is a norlignan that is a derivative of sugiresinol in which the aromatic ring B has an additional hydroxy substituent. It derives from a sugiresinol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1[C@H]([C@@H](CO[C@H]1C2=CC=C(C=C2)O)O)C3=CC(=C(C=C3)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_16436',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is an organic anion resulting from the deprotonation of the phosphinic acid group of (2R)-glufosinate zwitterion. 
It is a conjugate base of a (2R)-glufosinate zwitterion.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CP(=O)(CC[C@H](C(=O)[O-])[NH3+])[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_4714',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is a tetracosahexaenoate that is the conjugate base of (6Z,9Z,12Z,15Z,18Z,21Z)-tetracosahexaenoic acid, obtained by deprotonation of the carboxy group; major species at pH 7.3. It is a conjugate base of a (6Z,9Z,12Z,15Z,18Z,21Z)-tetracosahexaenoic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\C/C=C\\\\\\\\CCCCC(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_24831',\n", + " 'prompt': \"Obtain the SMILES code by converting the verbal description of the molecule into its appropriate representation.\\nThe followings are four examples for reference.\\nThe natural language question is: The molecule is an organosulfonate oxoanion obtained by deprotonation of the sulfonic acid group of acetyltaurine. It has a role as a human blood serum metabolite and a human urinary metabolite. It is a conjugate base of an acetyltaurine.\\nThe corresponding SMILES representation is:\\nCC(=O)NCCS(=O)(=O)[O-]\\nThe natural language question is: The molecule is the 4-O-feruloyl derivative of D-quinic acid. It is a quinic acid and an enoate ester. It derives from a (-)-quinic acid and a ferulic acid. It is a conjugate acid of a 4-O-feruloyl-D-quinate.\\nThe corresponding SMILES representation is:\\nCOC1=C(C=CC(=C1)/C=C/C(=O)OC2[C@@H](CC(C[C@H]2O)(C(=O)O)O)O)O\\nThe natural language question is: The molecule is a 2',3'-cyclic purine nucleotide. It has a role as an Escherichia coli metabolite. 
It is a conjugate acid of a 2',3'-cyclic AMP(1-).\\nThe corresponding SMILES representation is:\\nC1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@H]4[C@@H]([C@H](O3)CO)OP(=O)(O4)O)N\\nThe natural language question is: The molecule is a pyruvic acid derivative having a 3-fluoro substituent. It is a 2-oxo monocarboxylic acid and an organofluorine compound. It derives from a pyruvic acid. It is a conjugate acid of a 3-fluoropyruvate.\\nThe corresponding SMILES representation is:\\nC(C(=O)C(=O)O)F\\nNext, you will be given a sample for test.The natural language question is: The molecule is a quinolinemonocarboxylate that is the conjugate base of kynurenic acid It has a role as a human metabolite. It is a conjugate base of a kynurenic acid.\\nThe corresponding SMILES representation is:\\n\",\n", + " 'completion': 'C1=CC=C2C(=C1)C(=O)C=C(N2)C(=O)[O-]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_15172',\n", + " 'prompt': 'Convert the molecular description in natural language into its SMILES representation.The natural language question is: The molecule is an organic disulfide in which the disulfide bond links two units of captopril. It is a secondary metabolite of captopril. It has a role as a metabolite. It derives from a captopril.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H](CSSCC(C)C(=O)N1CCC[C@H]1C(=O)O)C(=O)N2CCC[C@H]2C(=O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13284',\n", + " 'prompt': 'Translate the molecular description in natural language to its SMILES representation.The natural language question is: The molecule is alkaline earth metal atom with atomic number 4. It has a role as a carcinogenic agent, an adjuvant and an epitope. 
It is an alkaline earth metal atom and a metal allergen.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[Be]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_25427',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is an alkylglucosinolate resulting from the removal of the proton of the hydrogen sulfate group of any omega-[(methylsulfinyl)alkyl]glucosinolic acid. It is a conjugate base of an omega-[(methylsulfinyl)alkyl]glucosinolic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CS(=O)CCC/C(=N/OS(=O)(=O)[O-])/S[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_19244',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is the stable isotope of beryllium with relative atomic mass 9.012182, 100 atom percent natural abundance and nuclear spin 3/2.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[9Be]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_13474',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is sodium fluorophosphate, commonly abbreviated MFP, is an inorganic compound with the chemical formula Na2PO3F. Typical for a salt, MFP is odourless, colourless, and water-soluble. This salt is an ingredient in toothpastes. It has a role as an antibacterial agent. 
It is an inorganic sodium salt and an inorganic phosphate.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': '[O-]P(=O)([O-])F.[Na+].[Na+]'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_3360',\n", + " 'prompt': 'Derive the SMILES notation from the verbal depiction of the molecule.The natural language question is: The molecule is a lyxonic acid that has L-configuration. It has a role as a human urinary metabolite and a rat metabolite. It is a conjugate acid of a L-lyxonate. It is an enantiomer of a D-lyxonic acid.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C([C@@H]([C@H]([C@H](C(=O)O)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_29614',\n", + " 'prompt': 'Generate the SMILES code from the verbal description of the molecule.The natural language question is: The molecule is a kaempferol O-glucoside that is kaempferol attached to a beta-D-glucofuranosyl moiety at position 3 via a glycosidic linkage. It has a role as a plant metabolite. It is a kaempferol O-glucoside, a monosaccharide derivative and a trihydroxyflavone.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C1=CC(=CC=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C@H]4[C@@H]([C@H]([C@H](O4)[C@@H](CO)O)O)O)O'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_6050',\n", + " 'prompt': 'Generate the SMILES code based on the natural language explanation of the molecule.The natural language question is: The molecule is a C50 carotenoid that is an intermediate in the biosynthesis of bacterioruberin, a red-coloured pigment found in several Halobacterium and Haloarcula species. It has a role as a bacterial metabolite. 
It is a C50 carotenoid, a tertiary alcohol and a diol.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'CC(=CC[C@@H](CC/C(=C/C=C/C(=C/C=C/C(=C/C=C/C=C(\\\\\\\\C)/C=C/C=C(\\\\\\\\C)/C=C/C=C(\\\\\\\\C)/C=C/[C@H](CC=C(C)C)C(C)(C)O)/C)/C)/C)C(C)(C)O)C'},\n", + " {'dataset': 'chebi',\n", + " 'id': 'chebi_14381',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is a quaternary ammonium ion that is the conjugate acid of 3-dehydrocarnitine. It has a role as a human metabolite. It derives from a carnitinium. It is a conjugate acid of a 3-dehydrocarnitine.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[N+](C)(C)CC(=O)CC(=O)O'},\n", + " ...]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "88669" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset': 'chebi',\n", + " 'id': 'chebi_7655',\n", + " 'prompt': 'Render the natural language description of the molecule into the corresponding SMILES representation.The natural language question is: The molecule is the conjugate base of pelargonidin 3-O-rutinoside; major species at pH 7.3. 
It is a conjugate base of a pelargonidin 3-O-rutinoside.\\nThe corresponding SMILES representation is:\\n',\n", + " 'completion': 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=C(OC4=CC(=O)C=C(C4=C3)O)C5=CC=C(C=C5)O)O)O)O)O)O)O'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Render the natural language description of the molecule into the '\n", + " 'corresponding SMILES representation.The natural language question is: The '\n", + " 'molecule is the conjugate base of pelargonidin 3-O-rutinoside; major species '\n", + " 'at pH 7.3. It is a conjugate base of a pelargonidin 3-O-rutinoside.\\n'\n", + " 'The corresponding SMILES representation is:\\n')\n", + "'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=C(OC4=CC(=O)C=C(C4=C3)O)C5=CC=C(C=C5)O)O)O)O)O)O)O'\n" + ] + } + ], + "source": [ + "\n", + "\n", + "import pprint\n", + "\n", + "pprint.pprint(data[0]['prompt'])\n", + "pprint.pprint(data[0]['completion'])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def demo():\n", + " random_number = random.randint(1, 80000) \n", + " print(random_number)\n", + " pprint.pprint(data[random_number]['prompt'])\n", + " pprint.pprint(data[random_number]['completion'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def demo():\n", + " random_number = random.randint(1, 80000) \n", + " print(random_number)\n", + " pprint.pprint(data[random_number]['prompt'])\n", + " 
print('-----------------------------------------------------')\n", + " pprint.pprint(data[random_number]['completion'])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44144\n", + "('Rewrite the question as a series of logical functions.\\n'\n", + " 'Each logic form is seperated by \\\\u2227 . We define the following valid '\n", + " 'predicates. For geometric shapes, the predicates are: Point(), Line(), '\n", + " 'Angle(), Triangle(), Quadrilateral(), Parallelogram(), Square(), '\n", + " 'Rectangle(), Rhombus(), Trapezoid(), Kite(), Polygon(), Pentagon(), '\n", + " 'Hexagon(), Heptagon(), Octagon(), Circle(), Arc(), Sector(), Shape(). For '\n", + " 'unary geometric attributes, the predicates are: RightAngle(), Right(), '\n", + " 'Isosceles(), Equilateral(), Regular(), Red(), Blue(), Green(), Shaded(). For '\n", + " 'geometric attributes, we define: AreaOf(), PerimeterOf(), RadiusOf(), '\n", + " 'FiameterOf(), CircumferenceOf(), AltitudeOf(), HypotenuseOf(), SideOf(), '\n", + " 'WidthOf(), HeightOf(), LegOf(), BaseOf(), MedianOf(), IntersectionOf(), '\n", + " 'MeasureOf(), LengthOf(), ScaleFactorOf(). For binary geometric relations, '\n", + " 'the predicates are: PointLiesOnLine(), PointLiesOnCircle(), Parallel(), '\n", + " 'Perpendicular(), IntersectAt(), BisectsAngle(), Congruent(), Similar(), '\n", + " 'Tangent(), Secant(), CircumscribedTo(), InscribedIn(). For A-isXof-B '\n", + " 'geometric relations, the defined predicates are: IsMidpointOf(), '\n", + " 'IsCentroidOf(), IsIncenterOf(), IsRadiusOf(), IsDiameterOf(), '\n", + " 'IsMidsegmentOf(), IsChordOf(), IsSideOf(), IsHypotenuseOf(), '\n", + " 'IsPerpendicularBisectorOf(), IsAltitudeOf(), IsMedianOf(), IsBaseOf(), '\n", + " 'IsDiagonalOf(), IsLegOf(). 
For numerical attributes and relations, the '\n", + " 'predicates are: SinOf(), CosOf(), TanOf(), CotOf(), HalfOf(), SquareOf(), '\n", + " 'SqrtOf(), RatioOf(), SumOf(), AverageOf(), Add(), Mul(), Sub(), Div(), '\n", + " 'Pow(), Equals(), Find(), UseTheorem().The natural language question is: Find '\n", + " 'm \\\\angle Z.\\n'\n", + " 'The corresponding logic form for the question is:\\n')\n", + "-----------------------------------------------------\n", + "'Find(MeasureOf(Angle(Z)))'\n" + ] + } + ], + "source": [ + "demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11825\n", + "('Transform the natural language sentence into an AMR form by abstracting its '\n", + " 'meaning and relationships.\\n'\n", + " 'The input sentence is: Why would nimh continue with the focus on my '\n", + " 'statements, when he obviously agreed once he saw the pictures a la PDiddie?\\n'\n", + " 'The AMR is:\\n')\n", + "-----------------------------------------------------\n", + "('(c / cause-01\\n'\n", + " ' :ARG0 (a / amr-unknown)\\n'\n", + " ' :ARG1 (c2 / continue-01\\n'\n", + " ' :ARG0 (p / person :wiki -\\n'\n", + " ' :name (n / name :op1 \"Nimh\"))\\n'\n", + " ' :ARG1 (f / focus-01\\n'\n", + " ' :ARG0 p\\n'\n", + " ' :ARG2 (t / thing\\n'\n", + " ' :ARG1-of (s / state-01\\n'\n", + " ' :ARG0 (i / i)))))\\n'\n", + " ' :concession (o / obvious-01\\n'\n", + " ' :ARG1 (a2 / agree-01\\n'\n", + " ' :ARG0 p\\n'\n", + " ' :time (o2 / once\\n'\n", + " ' :op1 (s2 / see-01\\n'\n", + " ' :ARG0 p\\n'\n", + " ' :ARG1 (p2 / picture-01\\n'\n", + " ' :manner (p3 / person :wiki \"Sean_Combs\"\\n'\n", + " ' :name (n2 / name :op1 '\n", + " '\"PDiddie\"))))))))')\n" + ] + } + ], + "source": [ + "demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "69365\n", + "('Given the 
description of a problem scene, please transform the full part of '\n", + " 'the scene from natural language into PDDL file. Please use the following '\n", + " 'pre-defined atom format to form the PDDL: (Pzl3Qg ?r - robot ?x - room) '\n", + " 'means the robot r is in room x. (95GFun ?o - object ?x - room) means the '\n", + " 'object o is in room x. (YxX7E2 ?r - robot ?g - gripper) means the gripper g '\n", + " 'of the robot r is free. (6z4Tww ?r - robot ?o - object ?g - gripper) means '\n", + " 'the robot r is caryying the object o with its gripper g. An example problem '\n", + " 'in natural language is: \\n'\n", + " ' You control 1 robots, each robot has a left gripper and a right gripper.\\n'\n", + " 'There are 5 rooms and 6 balls.robot1 is in room4. \\n'\n", + " 'ball1 is in room5. ball2 is in room3. ball3 is in room4. ball4 is in room5. '\n", + " 'ball5 is in room4. ball6 is in room5. \\n'\n", + " \"The robots' grippers are free.\\n\"\n", + " 'Your goal is to transport the balls to their destinations.ball1 should be in '\n", + " 'room5.\\n'\n", + " 'ball2 should be in room3.\\n'\n", + " 'ball3 should be in room4.\\n'\n", + " 'ball4 should be in room5.\\n'\n", + " 'ball5 should be in room4.\\n'\n", + " 'ball6 should be in room5.\\n'\n", + " 'The corresponding PDDL file to the full part of this problem is: \\n'\n", + " ' (define (problem grippers1x5x6)\\n'\n", + " '(:domain gripper-strips)\\n'\n", + " '(:objects\\n'\n", + " 'robot1 - robot\\n'\n", + " 'rgripper1 lgripper1 - gripper\\n'\n", + " 'room1 room2 room3 room4 room5 - room\\n'\n", + " 'ball1 ball2 ball3 ball4 ball5 ball6 - object\\n'\n", + " '\\n'\n", + " ')\\n'\n", + " '(:init\\n'\n", + " '(Pzl3Qg robot1 room4)\\n'\n", + " '(YxX7E2 robot1 rgripper1)\\n'\n", + " '(YxX7E2 robot1 lgripper1)\\n'\n", + " '(95GFun ball1 room5)\\n'\n", + " '(95GFun ball2 room3)\\n'\n", + " '(95GFun ball3 room4)\\n'\n", + " '(95GFun ball4 room5)\\n'\n", + " '(95GFun ball5 room4)\\n'\n", + " '(95GFun ball6 room5)\\n'\n", + " 
'\\n'\n", + " ')\\n'\n", + " '(:goal\\n'\n", + " '(and\\n'\n", + " '(95GFun ball1 room1)\\n'\n", + " '(95GFun ball2 room4)\\n'\n", + " '(95GFun ball3 room4)\\n'\n", + " '(95GFun ball4 room3)\\n'\n", + " '(95GFun ball5 room4)\\n'\n", + " '(95GFun ball6 room3)\\n'\n", + " '\\n'\n", + " ')\\n'\n", + " ')\\n'\n", + " ')\\n'\n", + " 'Now I have a new planning scene and its natural language description is:\\n'\n", + " 'You control 6 robots, each robot has a left gripper and a right gripper.\\n'\n", + " 'There are 1 rooms and 3 balls.robot1 is in room1. robot2 is in room1. robot3 '\n", + " 'is in room1. robot4 is in room1. robot5 is in room1. robot6 is in room1. \\n'\n", + " 'ball1 is in room1. ball2 is in room1. ball3 is in room1. \\n'\n", + " \"The robots' grippers are free.\\n\"\n", + " 'Your goal is to transport the balls to their destinations.ball1 should be in '\n", + " 'room1.\\n'\n", + " 'ball2 should be in room1.\\n'\n", + " 'ball3 should be in room1.\\n'\n", + " '\\n'\n", + " 'The PDDL file to the full part of this problem is:\\n')\n", + "-----------------------------------------------------\n", + "('(define (problem grippers6x1x3)\\n'\n", + " '(:domain gripper-strips)\\n'\n", + " '(:objects\\n'\n", + " 'robot1 robot2 robot3 robot4 robot5 robot6 - robot\\n'\n", + " 'rgripper1 lgripper1 rgripper2 lgripper2 rgripper3 lgripper3 rgripper4 '\n", + " 'lgripper4 rgripper5 lgripper5 rgripper6 lgripper6 - gripper\\n'\n", + " 'room1 - room\\n'\n", + " 'ball1 ball2 ball3 - object\\n'\n", + " '\\n'\n", + " ')\\n'\n", + " '(:init\\n'\n", + " '(Pzl3Qg robot1 room1)\\n'\n", + " '(YxX7E2 robot1 rgripper1)\\n'\n", + " '(YxX7E2 robot1 lgripper1)\\n'\n", + " '(Pzl3Qg robot2 room1)\\n'\n", + " '(YxX7E2 robot2 rgripper1)\\n'\n", + " '(YxX7E2 robot2 lgripper1)\\n'\n", + " '(Pzl3Qg robot3 room1)\\n'\n", + " '(YxX7E2 robot3 rgripper1)\\n'\n", + " '(YxX7E2 robot3 lgripper1)\\n'\n", + " '(Pzl3Qg robot4 room1)\\n'\n", + " '(YxX7E2 robot4 rgripper1)\\n'\n", + " '(YxX7E2 robot4 
lgripper1)\\n'\n", + " '(Pzl3Qg robot5 room1)\\n'\n", + " '(YxX7E2 robot5 rgripper1)\\n'\n", + " '(YxX7E2 robot5 lgripper1)\\n'\n", + " '(Pzl3Qg robot6 room1)\\n'\n", + " '(YxX7E2 robot6 rgripper1)\\n'\n", + " '(YxX7E2 robot6 lgripper1)\\n'\n", + " '(95GFun ball1 room1)\\n'\n", + " '(95GFun ball2 room1)\\n'\n", + " '(95GFun ball3 room1)\\n'\n", + " '\\n'\n", + " ')\\n'\n", + " '(:goal\\n'\n", + " '(and\\n'\n", + " '(95GFun ball1 room1)\\n'\n", + " '(95GFun ball2 room1)\\n'\n", + " '(95GFun ball3 room1)\\n'\n", + " '\\n'\n", + " ')\\n'\n", + " ')\\n'\n", + " ')\\n')\n" + ] + } + ], + "source": [ + "demo()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}