|
16 | 16 | from pandas.core.config import get_option |
17 | 17 | from pandas.io.date_converters import generic_parser |
18 | 18 | from pandas.io.common import get_filepath_or_buffer |
| 19 | +from pandas.tseries import tools |
19 | 20 |
|
20 | 21 | from pandas.util.decorators import Appender |
21 | 22 |
|
|
143 | 144 | warn_bad_lines: boolean, default True |
144 | 145 | If error_bad_lines is False, and warn_bad_lines is True, a warning for each |
145 | 146 | "bad line" will be output. (Only valid with C parser). |
| 147 | +infer_datetime_format : boolean, default False |
| 148 | + If True and parse_dates is enabled for a column, attempt to infer |
| 149 | + the datetime format to speed up the processing. |
146 | 150 |
|
147 | 151 | Returns |
148 | 152 | ------- |
@@ -262,6 +266,7 @@ def _read(filepath_or_buffer, kwds): |
262 | 266 | 'compression': None, |
263 | 267 | 'mangle_dupe_cols': True, |
264 | 268 | 'tupleize_cols': False, |
| 269 | + 'infer_datetime_format': False, |
265 | 270 | } |
266 | 271 |
|
267 | 272 |
|
@@ -349,7 +354,8 @@ def parser_f(filepath_or_buffer, |
349 | 354 | encoding=None, |
350 | 355 | squeeze=False, |
351 | 356 | mangle_dupe_cols=True, |
352 | | - tupleize_cols=False): |
| 357 | + tupleize_cols=False, |
| 358 | + infer_datetime_format=False): |
353 | 359 |
|
354 | 360 | # Alias sep -> delimiter. |
355 | 361 | if delimiter is None: |
@@ -408,7 +414,8 @@ def parser_f(filepath_or_buffer, |
408 | 414 | low_memory=low_memory, |
409 | 415 | buffer_lines=buffer_lines, |
410 | 416 | mangle_dupe_cols=mangle_dupe_cols, |
411 | | - tupleize_cols=tupleize_cols) |
| 417 | + tupleize_cols=tupleize_cols, |
| 418 | + infer_datetime_format=infer_datetime_format) |
412 | 419 |
|
413 | 420 | return _read(filepath_or_buffer, kwds) |
414 | 421 |
|
@@ -665,9 +672,13 @@ def __init__(self, kwds): |
665 | 672 | self.true_values = kwds.get('true_values') |
666 | 673 | self.false_values = kwds.get('false_values') |
667 | 674 | self.tupleize_cols = kwds.get('tupleize_cols', False) |
| 675 | + self.infer_datetime_format = kwds.pop('infer_datetime_format', False) |
668 | 676 |
|
669 | | - self._date_conv = _make_date_converter(date_parser=self.date_parser, |
670 | | - dayfirst=self.dayfirst) |
| 677 | + self._date_conv = _make_date_converter( |
| 678 | + date_parser=self.date_parser, |
| 679 | + dayfirst=self.dayfirst, |
| 680 | + infer_datetime_format=self.infer_datetime_format |
| 681 | + ) |
671 | 682 |
|
672 | 683 | # validate header options for mi |
673 | 684 | self.header = kwds.get('header') |
@@ -1178,6 +1189,10 @@ def TextParser(*args, **kwds): |
1178 | 1189 | Encoding to use for UTF when reading/writing (ex. 'utf-8') |
1179 | 1190 | squeeze : boolean, default False |
1180 | 1191 | returns Series if only one column |
| 1192 | + infer_datetime_format : boolean, default False |
| 1193 | + If True and `parse_dates` is True for a column, try to infer the |
| 1194 | + datetime format based on the first datetime string. If the format |
| 1195 | + can be inferred, there often will be a large parsing speed-up. |
1181 | 1196 | """ |
1182 | 1197 | kwds['engine'] = 'python' |
1183 | 1198 | return TextFileReader(*args, **kwds) |
@@ -1870,13 +1885,19 @@ def _get_lines(self, rows=None): |
1870 | 1885 | return self._check_thousands(lines) |
1871 | 1886 |
|
1872 | 1887 |
|
1873 | | -def _make_date_converter(date_parser=None, dayfirst=False): |
| 1888 | +def _make_date_converter(date_parser=None, dayfirst=False, |
| 1889 | + infer_datetime_format=False): |
1874 | 1890 | def converter(*date_cols): |
1875 | 1891 | if date_parser is None: |
1876 | 1892 | strs = _concat_date_cols(date_cols) |
1877 | 1893 | try: |
1878 | | - return tslib.array_to_datetime(com._ensure_object(strs), |
1879 | | - utc=None, dayfirst=dayfirst) |
| 1894 | + return tools.to_datetime( |
| 1895 | + com._ensure_object(strs), |
| 1896 | + utc=None, |
| 1897 | + box=False, |
| 1898 | + dayfirst=dayfirst, |
| 1899 | + infer_datetime_format=infer_datetime_format |
| 1900 | + ) |
1880 | 1901 | except: |
1881 | 1902 | return lib.try_parse_dates(strs, dayfirst=dayfirst) |
1882 | 1903 | else: |
|
0 commit comments