@@ -430,77 +430,54 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
   # pool underlying the training generation doesn't starve other processes.
   num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

+  flags_ = {
+      "data_dir": data_dir,
+      "cache_id": ncf_dataset.cache_paths.cache_id,
+      "num_neg": num_neg,
+      "num_train_positives": ncf_dataset.num_train_positives,
+      "num_items": ncf_dataset.num_items,
+      "num_readers": ncf_dataset.num_data_readers,
+      "epochs_per_cycle": epochs_per_cycle,
+      "train_batch_size": batch_size,
+      "eval_batch_size": eval_batch_size,
+      "num_workers": num_workers,
+      # This allows the training input function to guarantee batch size and
+      # significantly improves performance. (~5% increase in examples/sec on
+      # GPU, and needed for TPU XLA.)
+      "spillover": True,
+      "redirect_logs": use_subprocess,
+      "use_tf_logging": not use_subprocess,
+  }
+  if ncf_dataset.deterministic:
+    flags_["seed"] = stat_utils.random_int32()
+  # We write to a temp file then atomically rename it to the final file,
+  # because writing directly to the final file can cause the async data
+  # generation process to read a partially written flag file.
+  flagfile_temp = os.path.join(flags.FLAGS.data_dir, rconst.FLAGFILE_TEMP)
+  tf.logging.info("Preparing flagfile for async data generation in {} ..."
+                  .format(flagfile_temp))
+  with tf.gfile.Open(flagfile_temp, "w") as f:
+    for k, v in six.iteritems(flags_):
+      f.write("--{}={}\n".format(k, v))
+  flagfile = os.path.join(data_dir, rconst.FLAGFILE)
+  tf.gfile.Rename(flagfile_temp, flagfile)
+  tf.logging.info(
+      "Wrote flagfile for async data generation in {}."
+      .format(flagfile))
+
   if use_subprocess:
     tf.logging.info("Creating training file subprocess.")
-
     subproc_env = os.environ.copy()
-
     # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
     # resources and by default will try to allocate GPU memory. This would cause
     # contention with the main training process.
     subproc_env["CUDA_VISIBLE_DEVICES"] = ""
-
     subproc_args = popen_helper.INVOCATION + [
-        "--data_dir", data_dir,
-        "--cache_id", str(ncf_dataset.cache_paths.cache_id),
-        "--num_neg", str(num_neg),
-        "--num_train_positives", str(ncf_dataset.num_train_positives),
-        "--num_items", str(ncf_dataset.num_items),
-        "--num_readers", str(ncf_dataset.num_data_readers),
-        "--epochs_per_cycle", str(epochs_per_cycle),
-        "--train_batch_size", str(batch_size),
-        "--eval_batch_size", str(eval_batch_size),
-        "--num_workers", str(num_workers),
-        # This allows the training input function to guarantee batch size and
-        # significantly improves performance. (~5% increase in examples/sec on
-        # GPU, and needed for TPU XLA.)
-        "--spillover", "True",
-        "--redirect_logs", "True"
-    ]
-    if ncf_dataset.deterministic:
-      subproc_args.extend(["--seed", str(int(stat_utils.random_int32()))])
-
+        "--data_dir", data_dir]
     tf.logging.info(
         "Generation subprocess command: {}".format(" ".join(subproc_args)))
-
     proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

-  else:
-    # We write to a temp file then atomically rename it to the final file,
-    # because writing directly to the final file can cause the data generation
-    # async process to read a partially written JSON file.
-    command_file_temp = os.path.join(data_dir, rconst.COMMAND_FILE_TEMP)
-    tf.logging.info("Generation subprocess command at {} ..."
-                    .format(command_file_temp))
-    with tf.gfile.Open(command_file_temp, "w") as f:
-      command = {
-          "data_dir": data_dir,
-          "cache_id": ncf_dataset.cache_paths.cache_id,
-          "num_neg": num_neg,
-          "num_train_positives": ncf_dataset.num_train_positives,
-          "num_items": ncf_dataset.num_items,
-          "num_readers": ncf_dataset.num_data_readers,
-          "epochs_per_cycle": epochs_per_cycle,
-          "train_batch_size": batch_size,
-          "eval_batch_size": eval_batch_size,
-          "num_workers": num_workers,
-          # This allows the training input function to guarantee batch size and
-          # significantly improves performance. (~5% increase in examples/sec on
-          # GPU, and needed for TPU XLA.)
-          "spillover": True,
-          "redirect_logs": False
-      }
-      if ncf_dataset.deterministic:
-        command["seed"] = stat_utils.random_int32()
-
-      json.dump(command, f)
-    command_file = os.path.join(data_dir, rconst.COMMAND_FILE)
-    tf.gfile.Rename(command_file_temp, command_file)
-
-    tf.logging.info(
-        "Generation subprocess command saved to: {}"
-        .format(command_file))
-
   cleanup_called = {"finished": False}
   @atexit.register
   def cleanup():