RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1//
2// This file is part of the CCTBX distribution:
3// http://cctbx.sourceforge.net/
4// Downloaded from here:
5// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6//
7// Copyright (c) 2006, The Regents of the University of
8// California, through Lawrence Berkeley National Laboratory (subject to
9// receipt of any required approvals from the U.S. Dept. of Energy). All
10// rights reserved.
11//
12// The license is here:
13// http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14//
#include <RDGeneral/export.h>
#ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
#define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
#include <boost/python/object.hpp>
#include <boost/python/str.hpp>
#include <boost/python/extract.hpp>

#include <boost/optional.hpp>
#include <boost/utility/typed_in_place_factory.hpp>

//#include <tbxx/error_utils.hpp>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/Exceptions.h>

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <streambuf>
33
34namespace boost_adaptbx {
35namespace python {
36
37namespace bp = boost::python;
38
39/// A stream buffer getting data from and putting data into a Python file object
/** The aims are as follows:
41
42 - Given a C++ function acting on a standard stream, e.g.
43
44 \code
45 void read_inputs(std::istream& input) {
46 ...
47 input >> something >> something_else;
48 }
49 \endcode
50
51 and given a piece of Python code which creates a file-like object,
52 to be able to pass this file object to that C++ function, e.g.
53
54 \code
55 import gzip
56 gzip_file_obj = gzip.GzipFile(...)
57 read_inputs(gzip_file_obj)
58 \endcode
59
60 and have the standard stream pull data from and put data into the Python
61 file object.
62
63 - When Python \c read_inputs() returns, the Python object is able to
64 continue reading or writing where the C++ code left off.
65
66 - Operations in C++ on mere files should be competitively fast compared
67 to the direct use of \c std::fstream.
68
69
70 \b Motivation
71
72 - the standard Python library offer of file-like objects (files,
73 compressed files and archives, network, ...) is far superior to the
74 offer of streams in the C++ standard library and Boost C++ libraries.
75
76 - i/o code involves a fair amount of text processing which is more
77 efficiently prototyped in Python but then one may need to rewrite
78 a time-critical part in C++, in as seamless a manner as possible.
79
80 \b Usage
81
82 This is 2-step:
83
84 - a trivial wrapper function
85
86 \code
87 using boost_adaptbx::python::streambuf;
88 void read_inputs_wrapper(streambuf& input)
89 {
90 streambuf::istream is(input);
91 read_inputs(is);
92 }
93
94 def("read_inputs", read_inputs_wrapper);
95 \endcode
96
97 which has to be written every time one wants a Python binding for
98 such a C++ function.
99
100 - the Python side
101
102 \code
103 from boost.python import streambuf
104 read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
105 \endcode
106
107 \c buffer_size is optional. See also: \c default_buffer_size
108
109 Note: references are to the C++ standard (the numbers between parentheses
110 at the end of references are margin markers).
111*/
112class streambuf : public std::basic_streambuf<char> {
113 private:
114 typedef std::basic_streambuf<char> base_t;
115
116 public:
117 /* The syntax
118 using base_t::char_type;
119 would be nicer but Visual Studio C++ 8 chokes on it
120 */
121 typedef base_t::char_type char_type;
122 typedef base_t::int_type int_type;
123 typedef base_t::pos_type pos_type;
124 typedef base_t::off_type off_type;
125 typedef base_t::traits_type traits_type;
126
127 // work around Visual C++ 7.1 problem
128 inline static int traits_type_eof() { return traits_type::eof(); }
129
/// The default size of the read and write buffer.
/** This size is used for both the buffer of data read from the Python file
    object and the buffer of data waiting to be written to it whenever the
    constructor is given a buffer size of 0.
*/
static const std::size_t default_buffer_size = 1024;
135
136 /// Construct from a Python file object
137 /** if buffer_size is 0 the current default_buffer_size is used.
138 */
139 streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
140 : py_read(getattr(python_file_obj, "read", bp::object())),
141 py_write(getattr(python_file_obj, "write", bp::object())),
142 py_seek(getattr(python_file_obj, "seek", bp::object())),
143 py_tell(getattr(python_file_obj, "tell", bp::object())),
144 buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
145 write_buffer(nullptr),
146 pos_of_read_buffer_end_in_py_file(0),
147 pos_of_write_buffer_end_in_py_file(buffer_size),
148 farthest_pptr(nullptr) {
149 TEST_ASSERT(buffer_size != 0);
150 /* Some Python file objects (e.g. sys.stdout and sys.stdin)
151 have non-functional seek and tell. If so, assign None to
152 py_tell and py_seek.
153 */
154 if (py_tell != bp::object()) {
155 try {
156 off_type py_pos = bp::extract<off_type>(py_tell());
157 if (py_seek != bp::object()) {
158 /* Make sure we can actually seek.
159 bzip2 readers from python have a seek method, but it fails
160 when they are in write mode.
161 */
162 py_seek(py_pos);
163 }
164 } catch (bp::error_already_set&) {
165 py_tell = bp::object();
166 py_seek = bp::object();
167 /* Boost.Python does not do any Python exception handling whatsoever
168 So we need to catch it by hand like so.
169 */
170 PyErr_Clear();
171 }
172 }
173
174 if (py_write != bp::object()) {
175 // C-like string to make debugging easier
176 write_buffer = new char[buffer_size + 1];
177 write_buffer[buffer_size] = '\0';
178 setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
179 farthest_pptr = pptr();
180 } else {
181 // The first attempt at output will result in a call to overflow
182 setp(nullptr, nullptr);
183 }
184
185 if (py_tell != bp::object()) {
186 off_type py_pos = bp::extract<off_type>(py_tell());
187 pos_of_read_buffer_end_in_py_file = py_pos;
188 pos_of_write_buffer_end_in_py_file = py_pos;
189 }
190 }
191
192 /// constructor to enforce a mode (binary or text)
193 streambuf(bp::object& python_file_obj, char mode,
194 std::size_t buffer_size_ = 0)
195 : streambuf(python_file_obj, buffer_size_) {
196#if 1
197 bp::object io_mod = bp::import("io");
198 CHECK_INVARIANT(io_mod, "module not found");
199 bp::object iobase = io_mod.attr("TextIOBase");
200 CHECK_INVARIANT(iobase, "base class not found");
201#else
202 // using statics to save an undetermined amount of time results in
203 // alarming seg faults on windows. so we don't do it. Keep this here
204 // for the moment though in case someone manages to figure that out in
205 // the future
206 static bp::object io_mod = bp::object();
207 static bp::object iobase = bp::object();
208 if (!io_mod) io_mod = bp::import("io");
209 if (io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
210 CHECK_INVARIANT(io_mod, "module not found");
211 CHECK_INVARIANT(iobase, "base class not found");
212#endif
213
214 df_isTextMode = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
215 switch (mode) {
216 case 's': /// yeah, is redundant, but it is somehow natural to do "s"
217 case 't':
218 if (!df_isTextMode) {
220 "Need a text mode file object like StringIO or a file opened "
221 "with mode 't'");
222 }
223 break;
224 case 'b':
225 if (df_isTextMode) {
227 "Need a binary mode file object like BytesIO or a file opened "
228 "with mode 'b'");
229 }
230 break;
231 default:
232 throw std::invalid_argument("bad mode character");
233 }
234 }
235
236 /// Mundane destructor freeing the allocated resources
237 ~streambuf() override {
238 if (write_buffer) {
239 delete[] write_buffer;
240 }
241 }
242
243 /// C.f. C++ standard section 27.5.2.4.3
244 /** It is essential to override this virtual function for the stream
245 member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
246 */
247 std::streamsize showmanyc() override {
248 int_type const failure = traits_type::eof();
249 int_type status = underflow();
250 if (status == failure) {
251 return -1;
252 }
253 return egptr() - gptr();
254 }
255
256 /// C.f. C++ standard section 27.5.2.4.3
257 int_type underflow() override {
258 int_type const failure = traits_type::eof();
259 if (py_read == bp::object()) {
260 throw std::invalid_argument(
261 "That Python file object has no 'read' attribute");
262 }
263 read_buffer = py_read(buffer_size);
264 char* read_buffer_data;
265 bp::ssize_t py_n_read;
266 if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
267 &py_n_read) == -1) {
268 setg(nullptr, nullptr, nullptr);
269 throw std::invalid_argument(
270 "The method 'read' of the Python file object "
271 "did not return a string.");
272 }
273 off_type n_read = (off_type)py_n_read;
274 pos_of_read_buffer_end_in_py_file += n_read;
275 setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
276 // ^^^27.5.2.3.1 (4)
277 if (n_read == 0) {
278 return failure;
279 }
280 return traits_type::to_int_type(read_buffer_data[0]);
281 }
282
283 /// C.f. C++ standard section 27.5.2.4.5
285 if (py_write == bp::object()) {
286 throw std::invalid_argument(
287 "That Python file object has no 'write' attribute");
288 }
289 farthest_pptr = std::max(farthest_pptr, pptr());
290 off_type n_written = (off_type)(farthest_pptr - pbase());
291 off_type orig_n_written = n_written;
292 const unsigned int STD_ASCII = 0x7F;
293 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII) {
294 // we're somewhere in the middle of a utf8 block. If we
295 // only write part of it we'll end up with an exception,
296 // so push everything that could be utf8 into the next block
297 while (n_written > 0 && static_cast<unsigned int>(
298 write_buffer[n_written - 1]) > STD_ASCII) {
299 --n_written;
300 }
301 }
302 bp::str chunk(pbase(), pbase() + n_written);
303 py_write(chunk);
304
305 if ((!df_isTextMode || static_cast<unsigned int>(c) <= STD_ASCII) &&
306 !traits_type::eq_int_type(c, traits_type::eof())) {
307 py_write(traits_type::to_char_type(c));
308 n_written++;
309 }
310
311 setp(pbase(), epptr());
312 // ^^^ 27.5.2.4.5 (5)
313 farthest_pptr = pptr();
314 if (n_written) {
315 pos_of_write_buffer_end_in_py_file += n_written;
316 if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII &&
317 !traits_type::eq_int_type(c, traits_type::eof())) {
318 size_t n_to_copy = orig_n_written - n_written;
319
320 for (size_t i = 0; i < n_to_copy; ++i) {
321 sputc(write_buffer[n_written + i]);
322 ++farthest_pptr;
323 }
324 sputc(c);
325 ++farthest_pptr;
326 }
327 }
328 return traits_type::eq_int_type(c, traits_type::eof())
329 ? traits_type::not_eof(c)
330 : c;
331 }
332
333 /// Update the python file to reflect the state of this stream buffer
334 /** Empty the write buffer into the Python file object and set the seek
335 position of the latter accordingly (C++ standard section 27.5.2.4.2).
336 If there is no write buffer or it is empty, but there is a non-empty
337 read buffer, set the Python file object seek position to the
338 seek position in that read buffer.
339 */
340 int sync() override {
341 int result = 0;
342 farthest_pptr = std::max(farthest_pptr, pptr());
343 if (farthest_pptr && farthest_pptr > pbase()) {
344 off_type delta = pptr() - farthest_pptr;
345 int_type status = overflow();
346 if (traits_type::eq_int_type(status, traits_type::eof())) {
347 result = -1;
348 }
349 if (py_seek != bp::object()) {
350 py_seek(delta, 1);
351 }
352 } else if (gptr() && gptr() < egptr()) {
353 if (py_seek != bp::object()) {
354 py_seek(gptr() - egptr(), 1);
355 }
356 }
357 return result;
358 }
359
360 /// C.f. C++ standard section 27.5.2.4.2
361 /** This implementation is optimised to look whether the position is within
362 the buffers, so as to avoid calling Python seek or tell. It is
363 important for many applications that the overhead of calling into Python
364 is avoided as much as possible (e.g. parsers which may do a lot of
365 backtracking)
366 */
367 pos_type seekoff(off_type off, std::ios_base::seekdir way,
368 std::ios_base::openmode which =
369 std::ios_base::in | std::ios_base::out) override {
370 /* In practice, "which" is either std::ios_base::in or out
371 since we end up here because either seekp or seekg was called
372 on the stream using this buffer. That simplifies the code
373 in a few places.
374 */
375 int const failure = off_type(-1);
376
377 if (py_seek == bp::object()) {
378 throw std::invalid_argument(
379 "That Python file object has no 'seek' attribute");
380 }
381
382 // we need the read buffer to contain something!
383 if (which == std::ios_base::in && !gptr()) {
384 if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
385 return failure;
386 }
387 }
388
389 // compute the whence parameter for Python seek
390 int whence;
391 switch (way) {
392 case std::ios_base::beg:
393 whence = 0;
394 break;
395 case std::ios_base::cur:
396 whence = 1;
397 break;
398 case std::ios_base::end:
399 whence = 2;
400 break;
401 default:
402 return failure;
403 }
404
405 // Let's have a go
406 boost::optional<off_type> result =
407 seekoff_without_calling_python(off, way, which);
408 if (!result) {
409 // we need to call Python
410 if (which == std::ios_base::out) {
411 overflow();
412 }
413 if (way == std::ios_base::cur) {
414 if (which == std::ios_base::in) {
415 off -= egptr() - gptr();
416 } else if (which == std::ios_base::out) {
417 off += pptr() - pbase();
418 }
419 }
420 py_seek(off, whence);
421 result = off_type(bp::extract<off_type>(py_tell()));
422 if (which == std::ios_base::in) {
423 underflow();
424 }
425 }
426 return *result;
427 }
428
429 /// C.f. C++ standard section 27.5.2.4.2
431 std::ios_base::openmode which =
432 std::ios_base::in | std::ios_base::out) override {
433 return streambuf::seekoff(sp, std::ios_base::beg, which);
434 }
435
436 private:
437 bp::object py_read, py_write, py_seek, py_tell;
438
439 std::size_t buffer_size;
440
441 /* This is actually a Python string and the actual read buffer is
442 its internal data, i.e. an array of characters. We use a Boost.Python
443 object so as to hold on it: as a result, the actual buffer can't
444 go away.
445 */
446 bp::object read_buffer;
447
448 /* A mere array of char's allocated on the heap at construction time and
449 de-allocated only at destruction time.
450 */
451 char* write_buffer;
452 bool df_isTextMode;
453
454 off_type pos_of_read_buffer_end_in_py_file,
455 pos_of_write_buffer_end_in_py_file;
456
457 // the farthest place the buffer has been written into
458 char* farthest_pptr;
459
460 boost::optional<off_type> seekoff_without_calling_python(
461 off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
462 boost::optional<off_type> const failure;
463
464 // Buffer range and current position
465 off_type buf_begin, buf_end, buf_cur, upper_bound;
466 off_type pos_of_buffer_end_in_py_file;
467 if (which == std::ios_base::in) {
468 pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
469 buf_begin = reinterpret_cast<std::streamsize>(eback());
470 buf_cur = reinterpret_cast<std::streamsize>(gptr());
471 buf_end = reinterpret_cast<std::streamsize>(egptr());
472 upper_bound = buf_end;
473 } else if (which == std::ios_base::out) {
474 pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
475 buf_begin = reinterpret_cast<std::streamsize>(pbase());
476 buf_cur = reinterpret_cast<std::streamsize>(pptr());
477 buf_end = reinterpret_cast<std::streamsize>(epptr());
478 farthest_pptr = std::max(farthest_pptr, pptr());
479 upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
480 } else {
481 CHECK_INVARIANT(0, "unreachable code");
482 }
483
484 // Sought position in "buffer coordinate"
485 off_type buf_sought;
486 if (way == std::ios_base::cur) {
487 buf_sought = buf_cur + off;
488 } else if (way == std::ios_base::beg) {
489 buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
490 } else if (way == std::ios_base::end) {
491 return failure;
492 } else {
493 CHECK_INVARIANT(0, "unreachable code");
494 }
495
496 // if the sought position is not in the buffer, give up
497 if (buf_sought < buf_begin || buf_sought >= upper_bound) {
498 return failure;
499 }
500
501 // we are in wonderland
502 if (which == std::ios_base::in) {
503 gbump(buf_sought - buf_cur);
504 } else if (which == std::ios_base::out) {
505 pbump(buf_sought - buf_cur);
506 }
507 return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
508 }
509
510 public:
511 class istream : public std::istream {
512 public:
513 istream(streambuf& buf) : std::istream(&buf) {
514 exceptions(std::ios_base::badbit);
515 }
516
517 ~istream() override {
518 // do nothing.
519 // This used to do:
520 // if (this->good()) this->sync();
521 // but that caused problems if the underlying file had been closed
522 // (see github #579) and really doesn't seem necessary for what we're
523 // doing.
524 }
525 };
526
527 class ostream : public std::ostream {
528 public:
529 ostream(streambuf& buf) : std::ostream(&buf) {
530 exceptions(std::ios_base::badbit);
531 }
532
533 ~ostream() override {
534 if (this->good()) {
535 this->flush();
536 }
537 }
538 };
539};
540
541// std::size_t streambuf::default_buffer_size = 1024;
542
545
546 streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
547 : python_streambuf(python_file_obj, buffer_size) {}
548};
549
551 ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
552 : streambuf_capsule(python_file_obj, buffer_size),
554
555 ~ostream() noexcept override {
556 if (this->good()) {
557 this->flush();
558 }
559 }
560};
561} // namespace python
562} // namespace boost_adaptbx
563
564#endif // GUARD
#define TEST_ASSERT(expr)
Definition: Invariant.h:152
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
A stream buffer getting data from and putting data into a Python file object.
~streambuf() override
Mundane destructor freeing the allocated resources.
static const std::size_t default_buffer_size
The default size of the read and write buffer.
pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
std::streamsize showmanyc() override
C.f. C++ standard section 27.5.2.4.3.
pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
int sync() override
Update the python file to reflect the state of this stream buffer.
int_type overflow(int_type c=traits_type_eof()) override
C.f. C++ standard section 27.5.2.4.5.
int_type underflow() override
C.f. C++ standard section 27.5.2.4.3.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)