RDKit
Open-source cheminformatics and machine learning.
SubstructLibrary.h
Go to the documentation of this file.
1// Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
2// and other RDKit contributors
3//
4// All rights reserved.
5//
6// Redistribution and use in source and binary forms, with or without
7// modification, are permitted provided that the following conditions are
8// met:
9//
10// * Redistributions of source code must retain the above copyright
11// notice, this list of conditions and the following disclaimer.
12// * Redistributions in binary form must reproduce the above
13// copyright notice, this list of conditions and the following
14// disclaimer in the documentation and/or other materials provided
15// with the distribution.
16// * Neither the name of Novartis Institutes for BioMedical Research Inc.
17// nor the names of its contributors may be used to endorse or promote
18// products derived from this software without specific prior written
19// permission.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//
33#ifndef RDK_SUBSTRUCT_LIBRARY
34#define RDK_SUBSTRUCT_LIBRARY
35#include <utility>
36
37#include <RDGeneral/export.h>
38#include <GraphMol/RDKitBase.h>
39#include <GraphMol/MolPickler.h>
40#include <GraphMol/MolBundle.h>
46#include <DataStructs/BitOps.h>
47#include <GraphMol/MolOps.h>
49
50#include <algorithm>
51#include <string>
52#include <boost/lexical_cast.hpp>
53
54namespace RDKit {
55
57
58//! Base class API for holding molecules to substructure search.
59/*!
60 This is an API that hides the implementation details used for
61 indexing molecules for substructure searching. It simply
62 provides an API for adding and getting molecules from a set.
63 */
65 public:
66 virtual ~MolHolderBase() {}
67
68 //! Add a new molecule to the substructure search library
69 //! Returns the molecule's index in the library
70 virtual unsigned int addMol(const ROMol &m) = 0;
71
72 // implementations should throw IndexError on out of range
73 virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
74
75 //! Get the current library size
76 virtual unsigned int size() const = 0;
77};
78
79//! Concrete class that holds molecules in memory
80/*!
81 This is currently one of the faster implementations.
82 However it is very memory intensive.
83*/
85 std::vector<boost::shared_ptr<ROMol>> mols;
86
87 public:
88 MolHolder() : MolHolderBase(), mols() {}
89
90 unsigned int addMol(const ROMol &m) override {
91 mols.push_back(boost::make_shared<ROMol>(m));
92 return size() - 1;
93 }
94
95 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
96 if (idx >= mols.size()) {
97 throw IndexErrorException(idx);
98 }
99 return mols[idx];
100 }
101
102 unsigned int size() const override {
103 return rdcast<unsigned int>(mols.size());
104 }
105
106 std::vector<boost::shared_ptr<ROMol>> &getMols() { return mols; }
107 const std::vector<boost::shared_ptr<ROMol>> &getMols() const { return mols; }
108};
109
110//! Concrete class that holds binary cached molecules in memory
111/*!
112 This implementation uses quite a bit less memory than the
113 non cached implementation. However, due to the reduced speed
114 it should be used in conjunction with a pattern fingerprinter.
115
116 See RDKit::FPHolder
117*/
119 std::vector<std::string> mols;
120
121 public:
123
124 unsigned int addMol(const ROMol &m) override {
125 mols.emplace_back();
126 MolPickler::pickleMol(m, mols.back());
127 return size() - 1;
128 }
129
130 //! Adds a pickled binary molecule, no validity checking of the input
131 //! is done.
132 unsigned int addBinary(const std::string &pickle) {
133 mols.push_back(pickle);
134 return size() - 1;
135 }
136
137 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
138 if (idx >= mols.size()) {
139 throw IndexErrorException(idx);
140 }
141 boost::shared_ptr<ROMol> mol(new ROMol);
142 MolPickler::molFromPickle(mols[idx], mol.get());
143 return mol;
144 }
145
146 unsigned int size() const override {
147 return rdcast<unsigned int>(mols.size());
148 }
149
150 std::vector<std::string> &getMols() { return mols; }
151 const std::vector<std::string> &getMols() const { return mols; }
152};
153
154//! Concrete class that holds smiles strings in memory
155/*!
156 This implementation uses quite a bit less memory than the
157 cached binary or uncached implementation. However, due to the
158 reduced speed it should be used in conjunction with a pattern
159 fingerprinter.
160
161 See RDKit::FPHolder
162*/
164 : public MolHolderBase {
165 std::vector<std::string> mols;
166
167 public:
169
170 unsigned int addMol(const ROMol &m) override {
171 bool doIsomericSmiles = true;
172 mols.push_back(MolToSmiles(m, doIsomericSmiles));
173 return size() - 1;
174 }
175
176 //! Add a smiles to the dataset, no validation is done
177 //! to the inputs.
178 unsigned int addSmiles(const std::string &smiles) {
179 mols.push_back(smiles);
180 return size() - 1;
181 }
182
183 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
184 if (idx >= mols.size()) {
185 throw IndexErrorException(idx);
186 }
187
188 boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
189 return mol;
190 }
191
192 unsigned int size() const override {
193 return rdcast<unsigned int>(mols.size());
194 }
195
196 std::vector<std::string> &getMols() { return mols; }
197 const std::vector<std::string> &getMols() const { return mols; }
198};
199
200//! Concrete class that holds trusted smiles strings in memory
201/*!
202 A trusted smiles is essentially a smiles string that
203 RDKit has generated. This indicates that fewer
204 sanitization steps are required. See
205 http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
206
207 This implementation uses quite a bit less memory than the
208 cached binary or uncached implementation. However, due to the
209 reduced speed it should be used in conjunction with a pattern
210 fingerprinter.
211
212 See RDKit::FPHolder
213*/
215 : public MolHolderBase {
216 std::vector<std::string> mols;
217
218 public:
220
221 unsigned int addMol(const ROMol &m) override {
222 bool doIsomericSmiles = true;
223 mols.push_back(MolToSmiles(m, doIsomericSmiles));
224 return size() - 1;
225 }
226
227 //! Add a smiles to the dataset, no validation is done
228 //! to the inputs.
229 unsigned int addSmiles(const std::string &smiles) {
230 mols.push_back(smiles);
231 return size() - 1;
232 }
233
234 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
235 if (idx >= mols.size()) {
236 throw IndexErrorException(idx);
237 }
238
239 RWMol *m = SmilesToMol(mols[idx], 0, false);
240 if (m) {
241 m->updatePropertyCache();
242 }
243 return boost::shared_ptr<ROMol>(m);
244 }
245
246 unsigned int size() const override {
247 return rdcast<unsigned int>(mols.size());
248 }
249
250 std::vector<std::string> &getMols() { return mols; }
251 const std::vector<std::string> &getMols() const { return mols; }
252};
253
254//! Base FPI for the fingerprinter used to rule out impossible matches
256 std::vector<ExplicitBitVect *> fps;
257
258 public:
259 virtual ~FPHolderBase() {
260 for (size_t i = 0; i < fps.size(); ++i) {
261 delete fps[i];
262 }
263 }
264
265 virtual unsigned int size() const { return rdcast<unsigned int>(fps.size()); }
266
267 //! Adds a molecule to the fingerprinter
268 unsigned int addMol(const ROMol &m) {
269 fps.push_back(makeFingerprint(m));
270 return rdcast<unsigned int>(fps.size() - 1);
271 }
272
273 //! Adds a raw bit vector pointer to the fingerprinter, which takes ownership
274 //! PLEASE NOTE: make sure that the passed ExplicitBitVect
275 //! is compatible with the one generated by makeFingerprint()
277 fps.push_back(v);
278 return rdcast<unsigned int>(fps.size() - 1);
279 }
280
281 //! Adds a raw bit vector to the fingerprinter
282 //! PLEASE NOTE: make sure that the passed ExplicitBitVect
283 //! is compatible with the one generated by makeFingerprint()
284 unsigned int addFingerprint(const ExplicitBitVect &v) {
285 return addFingerprint(new ExplicitBitVect(v));
286 }
287
288 //! Return false if a substructure search can never match the molecule
289 bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
290 if (idx >= fps.size()) {
291 throw IndexErrorException(idx);
292 }
293
294 return AllProbeBitsMatch(query, *fps[idx]);
295 }
296
297 //! Get the bit vector at the specified index (throws IndexError if out of
298 //! range)
299 const ExplicitBitVect &getFingerprint(unsigned int idx) const {
300 if (idx >= fps.size()) {
301 throw IndexErrorException(idx);
302 }
303 return *fps[idx];
304 }
305
306 //! make the query vector
307 //! Caller owns the vector!
308 virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
309
310 std::vector<ExplicitBitVect *> &getFingerprints() { return fps; }
311 const std::vector<ExplicitBitVect *> &getFingerprints() const { return fps; }
312};
313
314//! Uses the pattern fingerprinter with a user-defined number of bits (default:
315//! 2048) to rule out matches
317 unsigned int numBits;
318
319 public:
320 PatternHolder() : FPHolderBase(), numBits(defaultNumBits()) {}
321 PatternHolder(unsigned int numBits) : FPHolderBase(), numBits(numBits) {}
322 //! Caller owns the vector!
323 ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
324 return PatternFingerprintMol(m, numBits);
325 }
326 const unsigned int &getNumBits() const { return numBits; };
327 unsigned int &getNumBits() { return numBits; };
//! Fingerprint size used by the default constructor.
static unsigned int defaultNumBits() {
  constexpr unsigned int kDefaultNumBits = 2048;
  return kDefaultNumBits;
}
332};
333
335 : public PatternHolder {
336 public:
338 TautomerPatternHolder(unsigned int numBits) : PatternHolder(numBits) {}
339 ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
340 std::vector<unsigned int> *atomCounts = nullptr;
341 ExplicitBitVect *setOnlyBits = nullptr;
342 const bool tautomericFingerprint = true;
343 return PatternFingerprintMol(m, getNumBits(), atomCounts, setOnlyBits,
344 tautomericFingerprint);
345 }
346};
347
349 public:
350 virtual ~KeyHolderBase() {}
351
352 //! Add a key to the database getting it from the molecule
353 virtual unsigned int addMol(const ROMol &m) = 0;
354
355 //! Add a key to the database, this needs to be in the same order
356 //! as the molecule, no validation is done
357 virtual unsigned int addKey(const std::string &) = 0;
358
359 //! Get the key at the requested index
360 // implementations should throw IndexError on out of range
361 virtual const std::string &getKey(unsigned int) const = 0;
362
363 //! Get the keys for a collection of indices
364 virtual std::vector<std::string> getKeys(
365 const std::vector<unsigned int> &indices) const = 0;
366 //! Get the current keyholder size
367 virtual unsigned int size() const = 0;
368};
369
371 std::string propname;
372 std::vector<std::string> keys;
373 const std::string empty_string = {};
374
375 public:
376 KeyFromPropHolder(const std::string &propname = "_Name")
377 : propname(propname) {}
378
379 std::string &getPropName() { return propname; }
380 const std::string &getPropName() const { return propname; }
381
382 std::vector<std::string> &getKeys() { return keys; }
383 const std::vector<std::string> &getKeys() const { return keys; }
384
385 unsigned int addMol(const ROMol &m) override {
386 std::string key;
387 if (m.getPropIfPresent(propname, key)) {
388 keys.push_back(std::move(key));
389 } else {
390 // XXX is this a warning? it could be verbose. Should we push back the
391 // string repr of the
392 // numeric index?
393 const static std::string prefix("LIBIDX-");
394 keys.emplace_back(prefix + boost::lexical_cast<std::string>(keys.size()));
395 }
396 return keys.size() - 1u;
397 };
398
399 unsigned int addKey(const std::string &key) override {
400 keys.push_back(key);
401 return keys.size() - 1u;
402 }
403
404 const std::string &getKey(unsigned int idx) const override {
405 if (idx >= keys.size()) {
406 throw IndexErrorException(idx);
407 }
408 return keys[idx];
409 }
410
411 std::vector<std::string> getKeys(
412 const std::vector<unsigned int> &indices) const override {
413 std::vector<std::string> res;
414 std::transform(indices.begin(), indices.end(), std::back_inserter(res),
415 [=](unsigned idx) { return keys.at(idx); });
416 return res;
417 }
418 unsigned int size() const override { return keys.size(); }
419};
420
421//! Substructure Search a library of molecules
422/*! This class allows for multithreaded substructure searches of
423 large datasets.
424
425 The implementations can use fingerprints to speed up searches
426 and have molecules cached as binary forms to reduce memory
427 usage.
428
429 basic usage:
430 \code
431 SubstructLibrary lib;
432 lib.addMol(mol);
433 std::vector<unsigned int> results = lib.getMatches(query);
434 for(std::vector<unsigned int>::const_iterator matchIndex=results.begin();
435 matchIndex != results.end();
436 ++matchIndex) {
437 boost::shared_ptr<ROMol> match = lib.getMol(*matchIndex);
438 }
439 \endcode
440
441 Using different mol holders and pattern fingerprints.
442
443 \code
444 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
445 boost::make_shared<CachedTrustedSmilesMolHolder>();
446 boost::shared_ptr<PatternHolder> patternHolder = \
447 boost::make_shared<PatternHolder>();
448
449 SubstructLibrary lib(molHolder, patternHolder);
450 lib.addMol(mol);
451 \endcode
452
453 Cached molecule holders create molecules on demand. There are currently
454 three styles of cached molecules.
455
456 CachedMolHolder: stores molecules in the rdkit binary format.
457 CachedSmilesMolHolder: stores molecules in smiles format.
458 CachedTrustedSmilesMolHolder: stores molecules in smiles format.
459
460 The CachedTrustedSmilesMolHolder is made to add molecules from
461 a trusted source. This makes the basic assumption that RDKit was
462 used to sanitize and canonicalize the smiles string. In practice
463 this is considerably faster than using arbitrary smiles strings since
464 certain assumptions can be made. Molecules generated from trusted
465 smiles do not have ring information (although this is created
466 in the molecule being searched if necessary).
467
468 When loading from external data, as opposed to using the "addMol" API,
469 care must be taken to ensure that the pattern fingerprints and smiles
470 are synchronized.
471
472 Each pattern holder has an API point for making its fingerprint. This
473 is useful to ensure that the pattern stored in the database will be
474 compatible with the patterns made when analyzing queries.
475
476 \code
477 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
478 boost::make_shared<CachedTrustedSmilesMolHolder>();
479 boost::shared_ptr<PatternHolder> patternHolder = \
480 boost::make_shared<PatternHolder>();
481
482 // the PatternHolder instance is able to make fingerprints.
483 // These, of course, can be read from a file. For demonstration
484 // purposes we construct them here.
485 const std::string trustedSmiles = "c1ccccc1";
486 ROMol *m = SmilesToMol(trustedSmiles);
487 const ExplicitBitVect *bitVector = patternHolder->makeFingerprint(*m);
488
489 // The trusted smiles and bitVector can be read from any source.
490 // This is the fastest way to load a substruct library.
491 molHolder->addSmiles( trustedSmiles );
492 patternHolder->addFingerprint( *bitVector );
493 SubstructLibrary lib(molHolder, patternHolder);
494 delete m;
495 delete bitVector;
496 \endcode
497
498 Finally, using the KeyFromPropHolder will store user ids or keys.
499 By default, it uses RDKit's default _Name prop, but can be changed
500 to any property.
501
502 \code
503 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
504 boost::make_shared<CachedTrustedSmilesMolHolder>();
505 boost::shared_ptr<KeyFromPropHolder> keyHolder = \
506 boost::make_shared<KeyFromPropHolder>();
507 SubstructLibrary lib(molHolder, keyHolder);
508 ...
509
510 // Keys can be looked up one at a time or in bulk through the key holder:
511 auto key = lib.getKeys().getKey(idx);
512 auto keys = lib.getKeys().getKeys(lib.getMatches(query));
513 \endcode
514
515*/
517 boost::shared_ptr<MolHolderBase> molholder;
518 boost::shared_ptr<FPHolderBase> fpholder;
519 boost::shared_ptr<KeyHolderBase> keyholder;
520
521 MolHolderBase *mols; // used for a small optimization
522 FPHolderBase *fps{nullptr};
523 bool is_tautomerquery = false;
524 std::vector<unsigned int> searchOrder;
525
526 public:
528 : molholder(new MolHolder),
529 fpholder(),
530 keyholder(),
531 mols(molholder.get()) {}
532
533 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
534 : molholder(std::move(molecules)),
535 fpholder(),
536 keyholder(),
537 mols(molholder.get()),
538 fps(nullptr) {}
539
540 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
541 boost::shared_ptr<FPHolderBase> fingerprints)
542 : molholder(std::move(molecules)),
543 fpholder(std::move(fingerprints)),
544 keyholder(),
545 mols(molholder.get()),
546 fps(fpholder.get()) {
547 if (fpholder.get() &&
548 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
549 is_tautomerquery = true;
550 }
551 }
552
553 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
554 boost::shared_ptr<KeyHolderBase> keys)
555 : molholder(std::move(molecules)),
556 fpholder(),
557 keyholder(std::move(keys)),
558 mols(molholder.get()),
559 fps(nullptr) {
560 if (fpholder.get() &&
561 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
562 is_tautomerquery = true;
563 }
564 }
565
566 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
567 boost::shared_ptr<FPHolderBase> fingerprints,
568 boost::shared_ptr<KeyHolderBase> keys)
569 : molholder(std::move(molecules)),
570 fpholder(std::move(fingerprints)),
571 keyholder(std::move(keys)),
572 mols(molholder.get()),
573 fps(fpholder.get()) {
574 if (fpholder.get() &&
575 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
576 is_tautomerquery = true;
577 }
578 }
579
580 SubstructLibrary(const std::string &pickle)
581 : molholder(new MolHolder),
582 fpholder(),
583 mols(molholder.get()),
584 fps(nullptr) {
585 initFromString(pickle);
586 if (fpholder.get() &&
587 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
588 is_tautomerquery = true;
589 }
590 }
591
592 //! Get the underlying molecule holder implementation
593 boost::shared_ptr<MolHolderBase> &getMolHolder() { return molholder; }
594
595 const boost::shared_ptr<MolHolderBase> &getMolHolder() const {
596 return molholder;
597 }
598
599 //! Get the underlying molecule holder implementation
600 boost::shared_ptr<FPHolderBase> &getFpHolder() { return fpholder; }
601
602 //! Get the underlying molecule holder implementation
603 const boost::shared_ptr<FPHolderBase> &getFpHolder() const {
604 return fpholder;
605 }
606
607 //! Get the underlying molecule holder implementation
608 boost::shared_ptr<KeyHolderBase> &getKeyHolder() { return keyholder; }
609
610 //! Get the underlying molecule holder implementation
611 const boost::shared_ptr<KeyHolderBase> &getKeyHolder() const {
612 return keyholder;
613 }
614
616 PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
617 return *mols;
618 }
619
620 //! Get the underlying fingerprint implementation.
621 /*! Throws a value error if no fingerprints have been set */
623 if (!fps) {
624 throw ValueErrorException("Substruct Library does not have fingerprints");
625 }
626 return *fps;
627 }
628
630 if (!fps) {
631 throw ValueErrorException("Substruct Library does not have fingerprints");
632 }
633 return *fps;
634 }
635
636 //! Get the underlying key holder implementation.
637 /*! Throws a value error if no keyholder has been set */
639 if (!keyholder.get()) {
640 throw ValueErrorException("Substruct Library does not have fingerprints");
641 }
642 return *keyholder.get();
643 }
644
645 //! Get the underlying key holder implementation.
646 /*! Throws a value error if no keyholder have been set */
647 const KeyHolderBase &getKeys() const {
648 if (!keyholder.get()) {
649 throw ValueErrorException("Substruct Library does not have fingerprints");
650 }
651 return *keyholder.get();
652 }
653
654 //! Add a molecule to the library
655 /*!
656 \param mol Molecule to add
657
658 returns index for the molecule in the library
659 */
660 unsigned int addMol(const ROMol &mol);
661
662 //! Get the matching indices for the query
663 /*!
664 \param query Query or Tautomer Query to match against molecules
665 \param recursionPossible flags whether or not recursive matches are allowed
666 [default true]
667 \param useChirality use atomic CIP codes as part of the comparison
668 [default true]
669 \param useQueryQueryMatches if set, the contents of atom and bond queries
670 will be used as part of the matching
671 [default false]
672 \param numThreads If -1 use all available processors [default -1]
673 \param maxResults Maximum results to return, -1 means return all
674 [default -1]
675 */
676 template <class Query>
677 std::vector<unsigned int> getMatches(const Query &query,
678 bool recursionPossible = true,
679 bool useChirality = true,
680 bool useQueryQueryMatches = false,
681 int numThreads = -1,
682 int maxResults = -1) const {
684 params.recursionPossible = recursionPossible;
685 params.useChirality = useChirality;
686 params.useQueryQueryMatches = useQueryQueryMatches;
687 return getMatches(query, 0, size(), params, numThreads, maxResults);
688 }
689 //! overload
690 template <class Query>
691 std::vector<unsigned int> getMatches(const Query &query,
692 const SubstructMatchParameters &params,
693 int numThreads = -1,
694 int maxResults = -1) const {
695 return getMatches(query, 0, size(), params, numThreads, maxResults);
696 }
697 //! Get the matching indices for the query between the given indices
698 /*!
699 \param query Query to match against molecules
700 \param startIdx Start index of the search
701 \param endIdx Ending idx (non-inclusive) of the search.
702 \param recursionPossible flags whether or not recursive matches are allowed
703 [default true]
704 \param useChirality use atomic CIP codes as part of the comparison
705 [default true]
706 \param useQueryQueryMatches if set, the contents of atom and bond queries
707 will be used as part of the matching
708 [default false]
709 \param numThreads If -1 use all available processors [default -1]
710 \param maxResults Maximum results to return, -1 means return all
711 [default -1]
712 */
713 template <class Query>
714 std::vector<unsigned int> getMatches(
715 const Query &query, unsigned int startIdx, unsigned int endIdx,
716 bool recursionPossible = true, bool useChirality = true,
717 bool useQueryQueryMatches = false, int numThreads = -1,
718 int maxResults = -1) const {
720 params.recursionPossible = recursionPossible;
721 params.useChirality = useChirality;
722 params.useQueryQueryMatches = useQueryQueryMatches;
723 return getMatches(query, startIdx, endIdx, params, numThreads, maxResults);
724 };
725 //! overload
726 std::vector<unsigned int> getMatches(const ROMol &query,
727 unsigned int startIdx,
728 unsigned int endIdx,
729 const SubstructMatchParameters &params,
730 int numThreads = -1,
731 int maxResults = -1) const;
732 //! overload
733 std::vector<unsigned int> getMatches(const MolBundle &query,
734 unsigned int startIdx,
735 unsigned int endIdx,
736 const SubstructMatchParameters &params,
737 int numThreads = -1,
738 int maxResults = -1) const;
739 //! overload
740 std::vector<unsigned int> getMatches(const TautomerQuery &query,
741 unsigned int startIdx,
742 unsigned int endIdx,
743 const SubstructMatchParameters &params,
744 int numThreads = -1,
745 int maxResults = -1) const;
746
747 //! Return the number of matches for the query
748 /*!
749 \param query Molecule or Tautomer Query to match against molecules
750 \param recursionPossible flags whether or not recursive matches are allowed
751 [default true]
752 \param useChirality use atomic CIP codes as part of the comparison
753 [default true]
754 \param useQueryQueryMatches if set, the contents of atom and bond queries
755 will be used as part of the matching
756 [default false]
757 \param numThreads If -1 use all available processors [default -1]
758 */
759 template <class Query>
760 unsigned int countMatches(const Query &query, bool recursionPossible = true,
761 bool useChirality = true,
762 bool useQueryQueryMatches = false,
763 int numThreads = -1) const {
765 params.recursionPossible = recursionPossible;
766 params.useChirality = useChirality;
767 params.useQueryQueryMatches = useQueryQueryMatches;
768 return countMatches(query, 0, size(), params, numThreads);
769 }
770 //! overload
771 template <class Query>
772 unsigned int countMatches(const Query &query,
773 const SubstructMatchParameters &params,
774 int numThreads = -1) const {
775 return countMatches(query, 0, size(), params, numThreads);
776 }
777
778 //! Return the number of matches for the query
779
780 //! Return the number of matches for the query between the given indices
781 /*!
782 \param query Query to match against molecules
783 \param startIdx Start index of the search
784 \param endIdx Ending idx (non-inclusive) of the search.
785 \param recursionPossible flags whether or not recursive matches are allowed
786 [default true]
787 \param useChirality use atomic CIP codes as part of the comparison
788 [default true]
789 \param useQueryQueryMatches if set, the contents of atom and bond queries
790 will be used as part of the matching
791 [default false]
792 \param numThreads If -1 use all available processors [default -1]
793 */
794 template <class Query>
795 unsigned int countMatches(const Query &query, unsigned int startIdx,
796 unsigned int endIdx, bool recursionPossible = true,
797 bool useChirality = true,
798 bool useQueryQueryMatches = false,
799 int numThreads = -1) const {
801 params.recursionPossible = recursionPossible;
802 params.useChirality = useChirality;
803 params.useQueryQueryMatches = useQueryQueryMatches;
804 return countMatches(query, startIdx, endIdx, params, numThreads);
805 };
806
807 //! overload
808 unsigned int countMatches(const ROMol &query, unsigned int startIdx,
809 unsigned int endIdx,
810 const SubstructMatchParameters &params,
811 int numThreads = -1) const;
812 //! overload
813 unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx,
814 unsigned int endIdx,
815 const SubstructMatchParameters &params,
816 int numThreads = -1) const;
817 //! overload
818 unsigned int countMatches(const MolBundle &query, unsigned int startIdx,
819 unsigned int endIdx,
820 const SubstructMatchParameters &params,
821 int numThreads = -1) const;
822
823 //! Returns true if any match exists for the query
824 /*!
825 \param query Molecule or Tautomer Query to match against molecules
826 \param recursionPossible flags whether or not recursive matches are allowed
827 [default true]
828 \param useChirality use atomic CIP codes as part of the comparison
829 [default true]
830 \param useQueryQueryMatches if set, the contents of atom and bond queries
831 will be used as part of the matching
832 [default false]
833 \param numThreads If -1 use all available processors [default -1]
834 */
835 template <class Query>
836 bool hasMatch(const Query &query, bool recursionPossible = true,
837 bool useChirality = true, bool useQueryQueryMatches = false,
838 int numThreads = -1) const {
840 params.recursionPossible = recursionPossible;
841 params.useChirality = useChirality;
842 params.useQueryQueryMatches = useQueryQueryMatches;
843 return hasMatch(query, 0, size(), params, numThreads);
844 }
845 //! overload
846 template <class Query>
847 bool hasMatch(const Query &query, const SubstructMatchParameters &params,
848 int numThreads = -1) const {
849 return hasMatch(query, 0, size(), params, numThreads);
850 }
851 //! Returns true if any match exists for the query between the specified
852 //! indices
853 /*!
854 \param query Query to match against molecules
855 \param startIdx Start index of the search
856 \param endIdx Ending idx (non-inclusive) of the search.
857 \param recursionPossible flags whether or not recursive matches are
858 allowed [default true] \param useChirality use atomic CIP codes as part
859 of the comparison [default true] \param useQueryQueryMatches if set, the
860 contents of atom and bond queries will be used as part of the matching
861 [default false]
862 \param numThreads If -1 use all available processors [default -1]
863 */
864 template <class Query>
865 bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx,
866 bool recursionPossible = true, bool useChirality = true,
867 bool useQueryQueryMatches = false, int numThreads = -1) const {
869 params.recursionPossible = recursionPossible;
870 params.useChirality = useChirality;
871 params.useQueryQueryMatches = useQueryQueryMatches;
872 return hasMatch(query, startIdx, endIdx, params, numThreads);
873 };
874 //! overload
875 bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
876 const SubstructMatchParameters &params,
877 int numThreads = -1) const;
878 //! overload
879 bool hasMatch(const TautomerQuery &query, unsigned int startIdx,
880 unsigned int endIdx, const SubstructMatchParameters &params,
881 int numThreads = -1) const;
882 //! overload
883 bool hasMatch(const MolBundle &query, unsigned int startIdx,
884 unsigned int endIdx, const SubstructMatchParameters &params,
885 int numThreads = -1) const;
886 //! Returns the molecule at the given index
887 /*!
888 \param idx Index of the molecule in the library (n.b. could contain
889 null)
890 */
891 boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
892 // expects implementation to throw IndexError if out of range
893 PRECONDITION(mols, "molholder is null in SubstructLibrary");
894 return mols->getMol(idx);
895 }
896
897 //! Returns the molecule at the given index
898 /*!
899 \param idx Index of the molecule in the library (n.b. could contain
900 null)
901 */
902 boost::shared_ptr<ROMol> operator[](unsigned int idx) {
903 // expects implementation to throw IndexError if out of range
904 PRECONDITION(mols, "molholder is null in SubstructLibrary");
905 return mols->getMol(idx);
906 }
907
908 //! return the number of molecules in the library
909 unsigned int size() const {
910 PRECONDITION(mols, "molholder is null in SubstructLibrary");
911 return rdcast<unsigned int>(molholder->size());
912 }
913
914 //! does error checking
915 void setSearchOrder(const std::vector<unsigned int> &order) {
916 for (const auto idx : order) {
917 if (idx >= mols->size()) {
918 throw IndexErrorException(idx);
919 }
920 }
921 searchOrder = order;
922 }
923
924 const std::vector<unsigned int> &getSearchOrder() const {
925 return searchOrder;
926 }
927
928 std::vector<unsigned int> &getSearchOrder() { return searchOrder; }
929 //! access required for serialization
931 is_tautomerquery = false;
932 mols = molholder.get();
933 fps = fpholder.get();
934 if (fps && dynamic_cast<TautomerPatternHolder *>(fps) != nullptr) {
935 is_tautomerquery = true;
936 }
937 }
938
939 //! serializes (pickles) to a stream
940 void toStream(std::ostream &ss) const;
941 //! returns a string with a serialized (pickled) representation
942 std::string Serialize() const;
943 //! initializes from a stream pickle
944 void initFromStream(std::istream &ss);
945 //! initializes from a string pickle
946 void initFromString(const std::string &text);
947};
948} // namespace RDKit
949
951#endif
Contains general bit-comparison and similarity operations.
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
Defines a class for managing bundles of molecules.
pulls in the core RDKit functionality
a class for bit vectors that are densely occupied
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:20
Concrete class that holds binary cached molecules in memory.
std::vector< std::string > & getMols()
unsigned int size() const override
Get the current library size.
unsigned int addMol(const ROMol &m) override
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
const std::vector< std::string > & getMols() const
unsigned int addBinary(const std::string &pickle)
Concrete class that holds smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
const std::vector< std::string > & getMols() const
unsigned int addMol(const ROMol &m) override
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int size() const override
Get the current library size.
Concrete class that holds trusted smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
unsigned int addMol(const ROMol &m) override
const std::vector< std::string > & getMols() const
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int size() const override
Get the current library size.
Base FPI for the fingerprinter used to rule out impossible matches.
std::vector< ExplicitBitVect * > & getFingerprints()
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
virtual unsigned int size() const
const std::vector< ExplicitBitVect * > & getFingerprints() const
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
unsigned int addFingerprint(ExplicitBitVect *v)
const ExplicitBitVect & getFingerprint(unsigned int idx) const
unsigned int addFingerprint(const ExplicitBitVect &v)
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const =0
KeyFromPropHolder(const std::string &propname="_Name")
const std::string & getKey(unsigned int idx) const override
unsigned int addKey(const std::string &key) override
unsigned int size() const override
Get the current keyholder size.
const std::vector< std::string > & getKeys() const
std::vector< std::string > & getKeys()
std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const override
const std::string & getPropName() const
unsigned int addMol(const ROMol &m) override
Add a key to the database getting it from the molecule.
virtual std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const =0
virtual const std::string & getKey(unsigned int) const =0
virtual unsigned int addMol(const ROMol &m)=0
Add a key to the database getting it from the molecule.
virtual unsigned int size() const =0
Get the current keyholder size.
virtual unsigned int addKey(const std::string &)=0
MolBundle contains a collection of related ROMols.
Definition: MolBundle.h:39
Base class API for holding molecules to substructure search.
virtual unsigned int addMol(const ROMol &m)=0
virtual unsigned int size() const =0
Get the current library size.
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
Concrete class that holds molecules in memory.
unsigned int addMol(const ROMol &m) override
const std::vector< boost::shared_ptr< ROMol > > & getMols() const
unsigned int size() const override
Get the current library size.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
std::vector< boost::shared_ptr< ROMol > > & getMols()
static void molFromPickle(const std::string &pickle, ROMol *mol, unsigned int propertyFlags)
constructs a molecule from a pickle stored in a string
static void pickleMol(const ROMol *mol, std::ostream &ss)
pickles a molecule and sends the results to stream ss
PatternHolder(unsigned int numBits)
const unsigned int & getNumBits() const
unsigned int & getNumBits()
static unsigned int defaultNumBits()
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
Substructure Search a library of molecules.
unsigned int countMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
unsigned int addMol(const ROMol &mol)
Add a molecule to the library.
std::vector< unsigned int > getMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void initFromStream(std::istream &ss)
initializes from a stream pickle
KeyHolderBase & getKeys()
Get the underlying key holder implementation.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
boost::shared_ptr< MolHolderBase > & getMolHolder()
Get the underlying molecule holder implementation.
bool hasMatch(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Returns true if any match exists for the query.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints, boost::shared_ptr< KeyHolderBase > keys)
unsigned int countMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
std::vector< unsigned int > getMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > getMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
boost::shared_ptr< FPHolderBase > & getFpHolder()
Get the underlying fingerprint holder implementation.
std::vector< unsigned int > getMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query.
const MolHolderBase & getMolecules() const
void initFromString(const std::string &text)
initializes from a string pickle
unsigned int countMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const KeyHolderBase & getKeys() const
Get the underlying key holder implementation.
const boost::shared_ptr< KeyHolderBase > & getKeyHolder() const
Get the underlying key holder implementation.
bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
unsigned int countMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
void setSearchOrder(const std::vector< unsigned int > &order)
does error checking
bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
const FPHolderBase & getFingerprints() const
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< KeyHolderBase > keys)
bool hasMatch(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
unsigned int countMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
bool hasMatch(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
boost::shared_ptr< KeyHolderBase > & getKeyHolder()
Get the underlying key holder implementation.
bool hasMatch(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const std::vector< unsigned int > & getSearchOrder() const
void resetHolders()
access required for serialization
unsigned int size() const
return the number of molecules in the library
std::vector< unsigned int > getMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query between the given indices.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
SubstructLibrary(const std::string &pickle)
std::string Serialize() const
returns a string with a serialized (pickled) representation
unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
std::vector< unsigned int > getMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void toStream(std::ostream &ss) const
serializes (pickles) to a stream
boost::shared_ptr< ROMol > operator[](unsigned int idx)
Returns the molecule at the given index.
const boost::shared_ptr< FPHolderBase > & getFpHolder() const
Get the underlying fingerprint holder implementation.
const boost::shared_ptr< MolHolderBase > & getMolHolder() const
std::vector< unsigned int > & getSearchOrder()
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
TautomerPatternHolder(unsigned int numBits)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
#define RDKIT_SUBSTRUCTLIBRARY_EXPORT
Definition: export.h:473
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
Std stuff.
Definition: Abbreviations.h:18
RDKIT_SMILESPARSE_EXPORT RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &params)
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=nullptr, ExplicitBitVect *setOnlyBits=nullptr, bool tautomericFingerprint=false)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SUBSTRUCTLIBRARY_EXPORT bool SubstructLibraryCanSerialize()
bool recursionPossible
Allow recursive queries.