Skip to content

PERF: changed default value of cache parameter to True in to_datetime function #26043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Jul 4, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
Corrected the code according to the reviewers' comments
  • Loading branch information
anmyachev committed Jul 3, 2019
commit 417e005d3432c2763d688fa91ce623709dccfb3f
26 changes: 16 additions & 10 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pandas._libs.tslibs.strptime import array_strptime
from pandas.util._decorators import deprecate_kwarg

from pandas.core.algorithms import unique
from pandas.core.dtypes.common import (
ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
Expand Down Expand Up @@ -42,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def do_cache(arg, check_count: int, unique_share: float):
def should_cache(arg, check_count: int, unique_share: float):
"""
Decides whether to do caching.

Expand All @@ -51,20 +52,25 @@ def do_cache(arg, check_count: int, unique_share: float):

Parameters
----------
arg: list, tuple, 1-d array, Series
arg: listlike, tuple, 1-d array, Series
check_count: int
0 < check_count <= len(arg)
unique_share: float
0 < unique_share < 1

Returns
-------
: bool
do_caching: bool
"""
from pandas.core.algorithms import unique
assert 0 < check_count <= len(arg)
assert 0 < unique_share < 1

unique = unique(arg[:check_count])
if len(unique) > check_count * unique_share:
return False
return True
do_caching = True

unique_elements = unique(arg[:check_count])
if len(unique_elements) > check_count * unique_share:
do_caching = False
return do_caching


def _maybe_cache(arg, format, cache, convert_listlike):
Expand All @@ -73,7 +79,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
arg : listlike, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : boolean
Expand All @@ -92,7 +98,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
# Perform a quicker unique check
from pandas import Index

if not do_cache(arg, int(len(arg) * 0.1), 0.7):
if not should_cache(arg, int(len(arg) * 0.1), 0.7):
return cache_array

unique_dates = Index(arg).unique()
Expand Down