BUG: convert numpy strings in index names in HDF #13492 (#16444)

* BUG: Handle numpy strings in index names in HDF5 #13492 * REF: refactor to _ensure_str
pandas-dev · abarber4gh · May 24, 2017 · May 23, 2017 · May 25, 2017 · May 25, 2017
commit 18c316b6fba1e00ae60b571304ffd1d0a00fc9a7
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -76,6 +76,7 @@ I/O
 - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
 - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
 - Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`)
+- Bug where ``pd.read_hdf()`` returns numpy strings for index names (:issue:`13492`)
 
 - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)
 

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -73,6 +73,18 @@ def _ensure_encoding(encoding):
     return encoding
 
 
+def _ensure_str(name):
+    """Ensure that an index / column name is a str (python 3) or
+    unicode (python 2); otherwise they may be np.string dtype.
+    Non-string dtypes are passed through unchanged.
+
+    https://github.com/pandas-dev/pandas/issues/13492
+    """
+    if isinstance(name, compat.string_types):
+        name = compat.text_type(name)
+    return name
+
+
 Term = Expr
 
 
@@ -2567,7 +2579,7 @@ def read_index_node(self, node, start=None, stop=None):
         name = None
 
         if 'name' in node._v_attrs:
-            name = node._v_attrs.name
+            name = _ensure_str(node._v_attrs.name)
 
         index_class = self._alias_to_class(getattr(node._v_attrs,
                                                    'index_class', ''))

diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -16,7 +16,7 @@
                     date_range, timedelta_range, Index, DatetimeIndex,
                     isnull)
 
-from pandas.compat import is_platform_windows, PY3, PY35, BytesIO
+from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type
 from pandas.io.formats.printing import pprint_thing
 
 tables = pytest.importorskip('tables')
@@ -2920,6 +2920,27 @@ def test_store_index_name_with_tz(self):
             recons = store['frame']
             tm.assert_frame_equal(recons, df)
 
+    @pytest.mark.parametrize('table_format', ['table', 'fixed'])
+    def test_store_index_name_numpy_str(self, table_format):
+        # GH #13492
+        idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1),
+                                       datetime.date(2000, 1, 2)]),
+                       name=u('cols\u05d2'))
+        idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1),
+                                        datetime.date(2010, 1, 2)]),
+                        name=u('rows\u05d0'))
+        df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
+
+        # This used to fail, returning numpy strings instead of python strings.
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format=table_format)
+            df2 = read_hdf(path, 'df')
+
+            assert_frame_equal(df, df2, check_names=True)
+
+            assert type(df2.index.name) == text_type
+            assert type(df2.columns.name) == text_type
+
     def test_store_series_name(self):
         df = tm.makeDataFrame()
         series = df['A']