Initializing Collective Communication
If you call an HCCL API such as get_local_rank_id, get_rank_size, or get_rank_id before calling sess.run() or estimator.train(), you need to start a separate session and run initialize_system to initialize collective communication. After the training is complete, run shutdown_system and then close the session.
import tensorflow as tf
from npu_bridge.estimator import npu_ops
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

npu_init = npu_ops.initialize_system()
npu_shutdown = npu_ops.shutdown_system()

# If other run parameters (for example, profiling or dumping) need to be set for training, you can pass them here.
config = tf.ConfigProto()
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # Disable remapping.

init_sess = tf.Session(config=config)
init_sess.run(npu_init)

# Call an HCCL API...

# Perform training. If another session is started to perform training, set the same run parameters as the preceding ones.

init_sess.run(npu_shutdown)
init_sess.close()
Or:
import tensorflow as tf
from npu_bridge.estimator import npu_ops
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

npu_init = npu_ops.initialize_system()
npu_shutdown = npu_ops.shutdown_system()

# If other run parameters (for example, profiling or dumping) need to be set for training, you can pass them here.
config = tf.ConfigProto()
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # Disable remapping.

with tf.Session(config=config) as sess:
    sess.run(npu_init)

    # Call an HCCL API...

    # Perform training...

    sess.run(npu_shutdown)
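For reference, the "Call an HCCL API..." placeholder could look like the following minimal sketch. It assumes the HCCL management APIs are importable from hccl.manage.api; verify the import path against your CANN installation before use.

# Minimal sketch of the "Call an HCCL API..." step (assumed import path:
# hccl.manage.api; adjust if your CANN version exposes it differently).
from hccl.manage.api import get_rank_size, get_rank_id, get_local_rank_id

with tf.Session(config=config) as sess:
    sess.run(npu_init)

    rank_size = get_rank_size()          # Total number of devices in the cluster.
    rank_id = get_rank_id()              # Global rank of the current device.
    local_rank_id = get_local_rank_id()  # Rank of the device within its server.
    print("rank %d/%d (local rank %d)" % (rank_id, rank_size, local_rank_id))

    # Perform training...

    sess.run(npu_shutdown)

These query functions only return valid values after initialize_system has been executed in the session, which is why the separate initialization session is required.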