RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2021 greg landrum, Rational Discovery LLC
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_MOLSUPPLIER_H
12#define RD_MOLSUPPLIER_H
13
14#include <RDGeneral/types.h>
15
16#include <string>
17#include <list>
18#include <memory>
19#include <vector>
20#include <iostream>
21#include <fstream>
22#include <GraphMol/ROMol.h>
24
25#ifdef RDK_BUILD_MAEPARSER_SUPPORT
26namespace schrodinger {
27namespace mae {
28class Reader;
29class Block;
30} // namespace mae
31} // namespace schrodinger
32#endif // RDK_BUILD_MAEPARSER_SUPPORT
33
34namespace RDKit {
35RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
36
37/*!
38//
39// Here are a couple of ways one can interact with MolSuppliers:
40//
41// 1) Lazy (ForwardIterator):
42// while(!supplier.atEnd()){
43// ROMol *mol = supplier.next();
44// if(mol){
45// do something;
46// }
47// }
48// 2) Random Access:
49// for(int i=0;i<supplier.length();i++){
50// ROMol *mol = supplier[i];
51// if(mol){
52// do something;
53// }
54// }
55//
56//
57*/
59 // this is an abstract base class to supply molecules one at a time
60 public:
62 virtual ~MolSupplier() {}
63 virtual void init() = 0;
64 virtual void reset() = 0;
65 virtual bool atEnd() = 0;
66 virtual ROMol *next() = 0;
67
68 virtual void close() {
69 if (df_owner) {
70 delete dp_inStream;
71 df_owner = false;
72 }
73 dp_inStream = nullptr;
74 }
75
76 private:
77 // disable automatic copy constructors and assignment operators
78 // for this class and its subclasses. They will likely be
79 // carrying around stream pointers and copying those is a recipe
80 // for disaster.
81 MolSupplier(const MolSupplier &);
82 MolSupplier &operator=(const MolSupplier &);
83
84 protected:
85 // stream to read the molecules from:
86 std::istream *dp_inStream = nullptr;
87 // do we own dp_inStream?
88 bool df_owner = false;
89 // opens a stream for reading and verifies that it can be read from.
90 // if not it throws an exception
91 // the caller owns the resulting stream
92 std::istream *openAndCheckStream(const std::string &filename) {
93 // FIX: this binary mode of opening file is here because of a bug in
94 // VC++ 6.0
95 // the function "tellg" does not work correctly if we do not open it this
96 // way
97 // Jan 2009: Confirmed that this is still the case in visual studio 2008
98 std::ifstream *strm =
99 new std::ifstream(filename.c_str(), std::ios_base::binary);
100 if ((!(*strm)) || strm->bad()) {
101 std::ostringstream errout;
102 errout << "Bad input file " << filename;
103 delete strm;
104 throw BadFileException(errout.str());
105 }
106
107 strm->peek();
108 if (strm->bad() || strm->eof()) {
109 std::ostringstream errout;
110 errout << "Invalid input file " << filename;
111 delete strm;
112 throw BadFileException(errout.str());
113 }
114 return static_cast<std::istream *>(strm);
115 }
116};
117
118// \brief a supplier from an SD file that only reads forward:
120 /*************************************************************************
121 * A lazy mol supplier from a SD file.
122 * - When new molecules are read using "next" their positions in the file are
123 *noted.
124 ***********************************************************************************/
125 public:
127
128 explicit ForwardSDMolSupplier(std::istream *inStream,
129 bool takeOwnership = true, bool sanitize = true,
130 bool removeHs = true,
131 bool strictParsing = false);
132
133 ~ForwardSDMolSupplier() override { close(); }
134
135 void init() override;
136 void reset() override;
137 ROMol *next() override;
138 bool atEnd() override;
139
140 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
141 bool getProcessPropertyLists() const { return df_processPropertyLists; }
142
143 bool getEOFHitOnRead() const { return df_eofHitOnRead; }
144
145 protected:
146 virtual void checkForEnd();
148 virtual void readMolProps(ROMol *);
149 bool df_end = false;
150 int d_line = 0; // line number we are currently on
151 bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
152 bool df_processPropertyLists = true;
153 bool df_eofHitOnRead = false;
154};
155
156// \brief a lazy supplier from an SD file
158 /*************************************************************************
159 * A lazy mol supplier from a SD file.
160 * - When new molecules are read using "next" their positions in the file are
161 *noted.
162 * - A call to the "length" will automatically parse the entire file and
163 *cache all the mol
164 * block positions
165 * - [] operator is used to access a molecule at "idx", calling next
166 *following this will result
167 * in the next molecule after "idx"
168 ***********************************************************************************/
169
170 public:
171 SDMolSupplier() { init(); }
172
173 /*!
174 * \param fileName - the name of the SD file
175 * \param sanitize - if true sanitize the molecule before returning it
176 * \param removeHs - if true remove Hs from the molecule before returning it
177 * (triggers sanitization)
178 * \param strictParsing - if set to false, the parser is more lax about
179 * correctness
180 * of the contents.
181 */
182 explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
183 bool removeHs = true, bool strictParsing = true);
184
185 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
186 bool sanitize = true, bool removeHs = true,
187 bool strictParsing = true);
188
189 ~SDMolSupplier() override { close(); }
190 void init() override;
191 void reset() override;
192 ROMol *next() override;
193 bool atEnd() override;
194 void moveTo(unsigned int idx);
195 ROMol *operator[](unsigned int idx);
196 /*! \brief returns the text block for a particular item
197 *
198 * \param idx - which item to return
199 */
200 std::string getItemText(unsigned int idx);
201 unsigned int length();
202 void setData(const std::string &text, bool sanitize = true,
203 bool removeHs = true);
204 void setData(const std::string &text, bool sanitize, bool removeHs,
205 bool strictParsing);
206
207 /*! Resets our internal state and sets the indices of molecules in the stream.
208 * The client should be *very* careful about calling this method, as it's
209 *trivial
210 * to end up with a completely useless supplier.
211 *
212 * \param locs - the vector of stream positions.
213 *
214 * Note that this can be used not only to make reading selected molecules
215 *from a
216 * large SD file much faster, but it can also allow subsetting an SD file or
217 * rearranging the order of the molecules.
218 */
219 void setStreamIndices(const std::vector<std::streampos> &locs);
220
221 private:
222 void checkForEnd() override;
223 void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
224 int d_len = 0; // total number of mol blocks in the file (initialized to -1)
225 int d_last = 0; // the molecule we are ready to read
226 std::vector<std::streampos> d_molpos;
227};
228
229//! lazy file parser for Smiles tables
231 /**************************************************************************
232 * Lazy file parser for Smiles table file, similar to the lazy SD
233 * file parser above
234 * - As and when new molecules are read using "next" their
235 * positions in the file are noted.
236 * - A call to the "length" will automatically parse the entire
237 * file and cache all the mol block positions
238 * - [] operator is used to access a molecule at "idx", calling
239 * next following this will result in the next molecule after
240 * "idx"
241 ***************************************************************************/
242 public:
243 /*!
244 * \param fileName - the name of smiles table file
245 * \param delimiter - delimiting characters between records on each
246 * line NOTE that this is not a string, the tokenizer looks for
247 * the individual characters in delimiter, not the full string
248 * itself. So the default delimiter: " \t", means " " or "\t".
249 * \param smilesColumn - column number for the SMILES string (defaults
250 * to the first column)
251 * \param nameColumn - column number for the molecule name (defaults to
252 * the second column) If set to -1 we assume that no name is
253 * available for the molecule and the name is defaulted to the
254 * smiles string
255 * \param titleLine - if true, the first line is assumed to list the
256 * names of properties in order separated by 'delimiter'. It is
257 * also assumed that the 'SMILES' column and the 'name' column
258 * are not specified here. If false - no title line is assumed
259 * and the properties are recorded as the "columnX" where "X" is
260 * the column number
261 * \param sanitize - if true sanitize the molecule before returning it
262 */
263 explicit SmilesMolSupplier(const std::string &fileName,
264 const std::string &delimiter = " \t",
265 int smilesColumn = 0, int nameColumn = 1,
266 bool titleLine = true, bool sanitize = true);
268 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
269 const std::string &delimiter = " \t",
270 int smilesColumn = 0, int nameColumn = 1,
271 bool titleLine = true, bool sanitize = true);
272
273 ~SmilesMolSupplier() override { close(); }
274 void setData(const std::string &text, const std::string &delimiter = " ",
275 int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
276 bool sanitize = true);
277 void init() override;
278 void reset() override;
279 ROMol *next() override;
280 bool atEnd() override;
281 void moveTo(unsigned int idx);
282 ROMol *operator[](unsigned int idx);
283 /*! \brief returns the text block for a particular item
284 *
285 * \param idx - which item to return
286 */
287 std::string getItemText(unsigned int idx);
288 unsigned int length();
289
290 private:
291 ROMol *processLine(std::string inLine);
292 void processTitleLine();
293 std::string nextLine();
294 long int skipComments();
295 void checkForEnd();
296
297 bool df_end = false; // have we reached the end of the file?
298 int d_len = 0; // total number of smiles in the file
299 int d_next = 0; // the molecule we are ready to read
300 int d_line = 0; // line number we are currently on
301 std::vector<std::streampos>
302 d_molpos; // vector of positions in the file for molecules
303 std::vector<int> d_lineNums;
304 std::string d_delim; // the delimiter string
305 bool df_sanitize = true; // sanitize molecules before returning them?
306 STR_VECT d_props; // vector of property names
307 bool df_title = true; // do we have a title line?
308 int d_smi = 0; // column id for the smile string
309 int d_name = 1; // column id for the name
310};
311
312//! lazy file parser for TDT files
314 /**************************************************************************
315 * Lazy file parser for TDT files, similar to the lazy SD
316 * file parser above
317 * - As and when new molecules are read using "next" their
318 * positions in the file are noted.
319 * - A call to the "length" will automatically parse the entire
320 * file and cache all the mol block positions
321 * - [] operator is used to access a molecule at "idx", calling
322 * next following this will result in the next molecule after
323 * "idx"
324 ***************************************************************************/
325 public:
326 /*!
327 * \param fileName - the name of the TDT file
328 * \param nameRecord - property name for the molecule name.
329 * If empty (the default), the name defaults to be empty
330 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
331 * structure (depiction) in the input will be read into the
332 * corresponding conformer id.
333 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
334 * structure (depiction) in the input will be read into the
335 * corresponding conformer id.
336 * \param sanitize - if true sanitize the molecule before returning it
337 */
338 explicit TDTMolSupplier(const std::string &fileName,
339 const std::string &nameRecord = "", int confId2D = -1,
340 int confId3D = 0, bool sanitize = true);
341 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
342 const std::string &nameRecord = "", int confId2D = -1,
343 int confId3D = 0, bool sanitize = true);
345 ~TDTMolSupplier() override { close(); }
346 void setData(const std::string &text, const std::string &nameRecord = "",
347 int confId2D = -1, int confId3D = 0, bool sanitize = true);
348 void init() override;
349 void reset() override;
350 ROMol *next() override;
351 bool atEnd() override;
352 void moveTo(unsigned int idx);
353 ROMol *operator[](unsigned int idx);
354 /*! \brief returns the text block for a particular item
355 *
356 * \param idx - which item to return
357 */
358 std::string getItemText(unsigned int idx);
359 unsigned int length();
360
361 private:
362 bool advanceToNextRecord();
363 void checkForEnd();
364 ROMol *parseMol(std::string inLine);
365
366 bool df_end = false; // have we reached the end of the file?
367 int d_len = 0; // total number of mols in the file
368 int d_last = 0; // the molecule we are ready to read
369 int d_line = 0; // line number we are currently on
370 int d_confId2D = -1; // id to use for 2D conformers
371 int d_confId3D = 0; // id to use for 3D conformers
372 std::vector<std::streampos>
373 d_molpos; // vector of positions in the file for molecules
374 bool df_sanitize = true; // sanitize molecules before returning them?
375 std::string d_nameProp =
376 ""; // local storage for the property providing mol names
377};
378
379//! lazy file parser for PDB files
381 public:
382 explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
383 bool sanitize = true, bool removeHs = true,
384 unsigned int flavor = 0,
385 bool proximityBonding = true);
386 explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
387 bool removeHs = true, unsigned int flavor = 0,
388 bool proximityBonding = true);
389
390 ~PDBMolSupplier() override { close(); }
391
392 void init() override;
393 void reset() override;
394 ROMol *next() override;
395 bool atEnd() override;
396
397 protected:
398 bool df_sanitize, df_removeHs, df_proximityBonding;
399 unsigned int d_flavor;
400};
401#ifdef RDK_BUILD_MAEPARSER_SUPPORT
402//! lazy file parser for MAE files
403class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
404 /**
405 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
406 * always requires taking ownership of the istream ptr, as the shared ptr will
407 * always clear it upon destruction.
408 */
409
410 public:
411 MaeMolSupplier() { init(); }
412
413 explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
414 bool sanitize = true, bool removeHs = true);
415
416 explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
417 bool sanitize = true, bool removeHs = true);
418
419 explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
420 bool removeHs = true);
421
422 ~MaeMolSupplier() override {}
423
424 void init() override;
425 void reset() override;
426 ROMol *next() override;
427 bool atEnd() override;
428
429 void close() override { dp_sInStream.reset(); }
430
431 private:
432 void moveToNextBlock();
433
434 protected:
435 bool df_sanitize, df_removeHs;
436 std::shared_ptr<schrodinger::mae::Reader> d_reader;
437 std::shared_ptr<schrodinger::mae::Block> d_next_struct;
438 std::shared_ptr<std::istream> dp_sInStream;
439 std::string d_stored_exc;
440};
441#endif // RDK_BUILD_MAEPARSER_SUPPORT
442} // namespace RDKit
443
444#endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
virtual void readMolProps(ROMol *)
void setProcessPropertyLists(bool val)
Definition: MolSupplier.h:140
ROMol * next() override
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
bool getProcessPropertyLists() const
Definition: MolSupplier.h:141
std::istream * openAndCheckStream(const std::string &filename)
Definition: MolSupplier.h:92
virtual bool atEnd()=0
virtual ROMol * next()=0
virtual void reset()=0
virtual void init()=0
virtual ~MolSupplier()
Definition: MolSupplier.h:62
virtual void close()
Definition: MolSupplier.h:68
lazy file parser for PDB files
Definition: MolSupplier.h:380
~PDBMolSupplier() override
Definition: MolSupplier.h:390
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
bool atEnd() override
void reset() override
ROMol * next() override
void init() override
PDBMolSupplier(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
unsigned int d_flavor
Definition: MolSupplier.h:399
void setStreamIndices(const std::vector< std::streampos > &locs)
ROMol * next() override
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
bool atEnd() override
ROMol * operator[](unsigned int idx)
unsigned int length()
void reset() override
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
void moveTo(unsigned int idx)
void init() override
~SDMolSupplier() override
Definition: MolSupplier.h:189
lazy file parser for Smiles tables
Definition: MolSupplier.h:230
~SmilesMolSupplier() override
Definition: MolSupplier.h:273
void moveTo(unsigned int idx)
ROMol * next() override
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void init() override
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void reset() override
ROMol * operator[](unsigned int idx)
bool atEnd() override
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
lazy file parser for TDT files
Definition: MolSupplier.h:313
void moveTo(unsigned int idx)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
ROMol * operator[](unsigned int idx)
bool atEnd() override
~TDTMolSupplier() override
Definition: MolSupplier.h:345
void init() override
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol * next() override
void reset() override
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
unsigned int length()
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:153
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:18
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition: Dict.h:29