Fix performance degradation with line protocol (influxdata#592)

shushen · xginn8 · commit c300105d906d · 2018-06-30T12:24:06.000-04:00
Assembling the output line by line to remove NaN values, as introduced in commit bf232a7, has a significant performance impact. This change fixes the issue by recording the positions of the NaN fields before stringifying the dataframe, replacing those fields with empty strings, and reverting to the pd.DataFrame.sum() function to yield the lines. Fixes: influxdata#591
diff --git a/influxdb/_dataframe_client.py b/influxdb/_dataframe_client.py
@@ -365,16 +365,18 @@ def _convert_dataframe_to_lines(self,
 
         # Make an array of formatted field keys and values
         field_df = dataframe[field_columns]
+        # Keep the positions where Null values are found
+        mask_null = field_df.isnull().values
 
         field_df = self._stringify_dataframe(field_df,
                                              numeric_precision,
                                              datatype='field')
 
-        def format_line(line):
-            line = line[~line.isnull()]  # drop None entries
-            return ",".join((line.index + '=' + line.values))
-
-        fields = field_df.apply(format_line, axis=1)
+        field_df = (field_df.columns.values + '=').tolist() + field_df
+        field_df[field_df.columns[1:]] = ',' + field_df[
+            field_df.columns[1:]]
+        field_df = field_df.where(~mask_null, '')  # drop Null entries
+        fields = field_df.sum(axis=1)
         del field_df
 
         # Generate line protocol string
@@ -388,9 +390,6 @@ def _stringify_dataframe(dframe, numeric_precision, datatype='field'):
         # Prevent modification of input dataframe
         dframe = dframe.copy()
 
-        # Keep the positions where Null values are found
-        mask_null = dframe.isnull().values
-
         # Find int and string columns for field-type data
         int_columns = dframe.select_dtypes(include=['integer']).columns
         string_columns = dframe.select_dtypes(include=['object']).columns
@@ -435,7 +434,6 @@ def _stringify_dataframe(dframe, numeric_precision, datatype='field'):
 
         dframe.columns = dframe.columns.astype(str)
 
-        dframe = dframe.where(~mask_null, None)
         return dframe
 
     def _datetime_to_epoch(self, datetime, time_precision='s'):