RDKit
Open-source cheminformatics and machine learning.
SmilesWrite.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_SMILESWRITE_H_012020
12#define RD_SMILESWRITE_H_012020
13
14#include <string>
15#include <vector>
16#include <memory>
17#include <cstdint>
18#include <limits>
19
20namespace RDKit {
21class Atom;
22class Bond;
23class ROMol;
24
26 bool doIsomericSmiles =
27 true; /**< include stereochemistry and isotope information */
28 bool doKekule = false; /**< kekulize the molecule before generating the SMILES
29 and output single/double bonds. NOTE that the output
30 is not canonical and that this will thrown an
31 exception if the molecule cannot be kekulized. */
32 bool canonical = true; /**< generate canonical SMILES */
33 bool allBondsExplicit = false; /**< include symbols for all bonds */
34 bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
35 bool doRandom = false; /**< randomize the output order. The resulting SMILES
36 is not canonical */
37 int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
38 atom. The resulting SMILES is not canonical */
39};
40namespace SmilesWrite {
41
//! \brief Flags selecting which CXSMILES extension fields are considered.
/*!
  Each named field occupies its own bit; CX_NONE selects nothing and CX_ALL
  sets every bit below the sign bit. The values are passed through the
  std::uint32_t \c flags arguments of the CXSMILES functions in this header.
*/
enum CXSmilesFields : uint32_t {
  CX_NONE = 0,
  CX_ATOM_LABELS = 1 << 0,
  CX_MOLFILE_VALUES = 1 << 1,
  CX_COORDS = 1 << 2,
  CX_RADICALS = 1 << 3,
  CX_ATOM_PROPS = 1 << 4,
  CX_LINKNODES = 1 << 5,
  CX_ENHANCEDSTEREO = 1 << 6,
  CX_SGROUPS = 1 << 7,
  CX_POLYMER = 1 << 8,
  CX_ALL = 0x7fffffff
};
55
56//! \brief returns the cxsmiles data for a molecule
58 const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
59
60//! \brief returns true if the atom number is in the SMILES organic subset
62
63//! \brief returns the SMILES for an atom
64/*!
65 \param atom : the atom to work with
66 \param doKekule : we're doing kekulized smiles (e.g. don't use
67 lower case for the atom label)
68 \param bondIn : the bond we came into the atom on (unused)
69 \param allHsExplicit : if true, hydrogen counts will be provided for every
70 atom.
71 \param isomericSmiles : if true, isomeric SMILES will be generated
72*/
74 bool doKekule = false,
75 const Bond *bondIn = nullptr,
76 bool allHsExplicit = false,
77 bool isomericSmiles = true);
78
79//! \brief returns the SMILES for a bond
80/*!
81 \param bond : the bond to work with
82 \param atomToLeftIdx : the index of the atom preceding \c bond
83 in the SMILES
84 \param doKekule : we're doing kekulized smiles (e.g. write out
85 bond orders for aromatic bonds)
86 \param allBondsExplicit : if true, symbols will be included for all bonds.
87*/
89 const Bond *bond, int atomToLeftIdx = -1, bool doKekule = false,
90 bool allBondsExplicit = false);
91} // namespace SmilesWrite
92
93//! \brief returns canonical SMILES for a molecule
95 const ROMol &mol, const SmilesWriteParams &params);
96
97//! \brief returns canonical SMILES for a molecule
98/*!
99 \param mol : the molecule in question.
100 \param doIsomericSmiles : include stereochemistry and isotope information
101 in the SMILES
102
103 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
104 this will throw an exception if the molecule cannot be kekulized.
105
106 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
107 The resulting SMILES is not, of course, canonical.
108 \param canonical : if false, no attempt will be made to canonicalize the
109 SMILES
110 \param allBondsExplicit : if true, symbols will be included for all bonds.
111 \param allHsExplicit : if true, hydrogen counts will be provided for every
112 atom.
113 */
114inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
115 bool doKekule = false, int rootedAtAtom = -1,
116 bool canonical = true,
117 bool allBondsExplicit = false,
118 bool allHsExplicit = false,
119 bool doRandom = false) {
121 ps.doIsomericSmiles = doIsomericSmiles;
122 ps.doKekule = doKekule;
123 ps.rootedAtAtom = rootedAtAtom;
124 ps.canonical = canonical;
125 ps.allBondsExplicit = allBondsExplicit;
126 ps.allHsExplicit = allHsExplicit;
127 ps.doRandom = doRandom;
128 return MolToSmiles(mol, ps);
129};
130
131//! \brief returns a vector of random SMILES for a molecule (may contain
132//! duplicates)
133/*!
134 \param mol : the molecule in question.
135 \param numSmiles : the number of SMILES to return
136 \param randomSeed : if >0, will be used to seed the random number generator
137 \param doIsomericSmiles : include stereochemistry and isotope information
138 in the SMILES
139 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
140 \param allBondsExplicit : if true, symbols will be included for all bonds.
141 \param allHsExplicit : if true, hydrogen counts will be provided for every
142 atom.
143 */
145 const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
146 bool doIsomericSmiles = true, bool doKekule = false,
147 bool allBondsExplicit = false, bool allHsExplicit = false);
148
149//! \brief returns canonical SMILES for part of a molecule
151 const ROMol &mol, const SmilesWriteParams &params,
152 const std::vector<int> &atomsToUse,
153 const std::vector<int> *bondsToUse = nullptr,
154 const std::vector<std::string> *atomSymbols = nullptr,
155 const std::vector<std::string> *bondSymbols = nullptr);
156
157//! \brief returns canonical SMILES for part of a molecule
158/*!
159 \param mol : the molecule in question.
160 \param atomsToUse : indices of the atoms in the fragment
161 \param bondsToUse : indices of the bonds in the fragment. If this is not
162 provided,
163 all bonds between the atoms in atomsToUse will be included
164 \param atomSymbols : symbols to use for the atoms in the output SMILES
165 \param bondSymbols : symbols to use for the bonds in the output SMILES
166 \param doIsomericSmiles : include stereochemistry and isotope information
167 in the SMILES
168 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
169 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
170 The resulting SMILES is not, of course, canonical.
171 \param canonical : if false, no attempt will be made to canonicalize the
172 SMILES
173 \param allBondsExplicit : if true, symbols will be included for all bonds.
174 \param allHsExplicit : if true, hydrogen counts will be provided for every
175 atom.
176 \param doRandom : generate a randomized smiles string by randomly choosing
177 the priority to follow in the DFS traversal. [default false]
178
179 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
180
181 */
182inline std::string MolFragmentToSmiles(
183 const ROMol &mol, const std::vector<int> &atomsToUse,
184 const std::vector<int> *bondsToUse = nullptr,
185 const std::vector<std::string> *atomSymbols = nullptr,
186 const std::vector<std::string> *bondSymbols = nullptr,
187 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
188 bool canonical = true, bool allBondsExplicit = false,
189 bool allHsExplicit = false) {
191 ps.doIsomericSmiles = doIsomericSmiles;
192 ps.doKekule = doKekule;
193 ps.rootedAtAtom = rootedAtAtom;
194 ps.canonical = canonical;
195 ps.allBondsExplicit = allBondsExplicit;
196 ps.allHsExplicit = allHsExplicit;
197 return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
198 bondSymbols);
199}
200
201//! \brief returns canonical CXSMILES for a molecule
203 const ROMol &mol, const SmilesWriteParams &ps,
204 std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL);
205
206//! \brief returns canonical CXSMILES for a molecule
207/*!
208 \param mol : the molecule in question.
209 \param doIsomericSmiles : include stereochemistry and isotope information
210 in the SMILES
211 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
212 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
213 The resulting SMILES is not, of course, canonical.
214 \param canonical : if false, no attempt will be made to canonicalize the
215 SMILES
216 \param allBondsExplicit : if true, symbols will be included for all bonds.
217 \param allHsExplicit : if true, hydrogen counts will be provided for every
218 atom.
219 */
220inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
221 bool doKekule = false, int rootedAtAtom = -1,
222 bool canonical = true,
223 bool allBondsExplicit = false,
224 bool allHsExplicit = false,
225 bool doRandom = false) {
227 ps.doIsomericSmiles = doIsomericSmiles;
228 ps.doKekule = doKekule;
229 ps.rootedAtAtom = rootedAtAtom;
230 ps.canonical = canonical;
231 ps.allBondsExplicit = allBondsExplicit;
232 ps.allHsExplicit = allHsExplicit;
233 ps.doRandom = doRandom;
234 return MolToCXSmiles(mol, ps);
235};
236
237//! \brief returns canonical CXSMILES for part of a molecule
239 const ROMol &mol, const SmilesWriteParams &params,
240 const std::vector<int> &atomsToUse,
241 const std::vector<int> *bondsToUse = nullptr,
242 const std::vector<std::string> *atomSymbols = nullptr,
243 const std::vector<std::string> *bondSymbols = nullptr);
244
245//! \brief returns canonical CXSMILES for part of a molecule
246/*!
247 \param mol : the molecule in question.
248 \param atomsToUse : indices of the atoms in the fragment
249 \param bondsToUse : indices of the bonds in the fragment. If this is not
250 provided,
251 all bonds between the atoms in atomsToUse will be included
252 \param atomSymbols : symbols to use for the atoms in the output SMILES
253 \param bondSymbols : symbols to use for the bonds in the output SMILES
254 \param doIsomericSmiles : include stereochemistry and isotope information
255 in the SMILES
256 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
257 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
258 The resulting SMILES is not, of course, canonical.
259 \param canonical : if false, no attempt will be made to canonicalize the
260 SMILES
261 \param allBondsExplicit : if true, symbols will be included for all bonds.
262 \param allHsExplicit : if true, hydrogen counts will be provided for every
263 atom.
264
265 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
266
267 */
268inline std::string MolFragmentToCXSmiles(
269 const ROMol &mol, const std::vector<int> &atomsToUse,
270 const std::vector<int> *bondsToUse = nullptr,
271 const std::vector<std::string> *atomSymbols = nullptr,
272 const std::vector<std::string> *bondSymbols = nullptr,
273 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
274 bool canonical = true, bool allBondsExplicit = false,
275 bool allHsExplicit = false) {
277 ps.doIsomericSmiles = doIsomericSmiles;
278 ps.doKekule = doKekule;
279 ps.rootedAtAtom = rootedAtAtom;
280 ps.canonical = canonical;
281 ps.allBondsExplicit = allBondsExplicit;
282 ps.allHsExplicit = allHsExplicit;
283 return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
284 bondSymbols);
285}
286
287} // namespace RDKit
288#endif
The class for representing atoms.
Definition: Atom.h:68
class for representing a bond
Definition: Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition: export.h:449
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx=-1, bool doKekule=false, bool allBondsExplicit=false)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, bool doKekule=false, const Bond *bondIn=nullptr, bool allHsExplicit=false, bool isomericSmiles=true)
returns the SMILES for an atom
Std stuff.
Definition: Abbreviations.h:18
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL)
returns canonical CXSMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule