.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/plot_profiling.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end ` to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_plot_profiling.py:

.. _l-onnx-array-onnxruntime-profiling:

Profiling with onnxruntime
==========================

*onnxruntime* optimizes the ONNX graph by default before running the
inference. It modifies, fuses, or adds new operators. Some of them are
standard ONNX operators, some of them are implemented in onnxruntime
(see `Supported Operators `_). This example profiles both models.

Optimize a model with onnxruntime
+++++++++++++++++++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 18-38

.. code-block:: Python

    import os
    import numpy
    import matplotlib.pyplot as plt
    from onnxruntime import get_available_providers
    from onnx_array_api.ext_test_case import example_path
    from onnx_array_api.ort.ort_optimizers import ort_optimized_model
    from onnx_array_api.ort.ort_profile import ort_profile, merge_ort_profile
    from onnx_array_api.plotting.stat_plot import plot_ort_profile

    suffix = ""
    filename = example_path(f"data/small{suffix}.onnx")
    optimized = filename + ".optimized.onnx"
    print(f"model={filename!r}")

    if not os.path.exists(optimized):
        ort_optimized_model(filename, output=optimized)
    print(f"optimized={optimized!r}")

.. rst-class:: sphx-glr-script-out

.. code-block:: none

    model='data/small.onnx'
    optimized='data/small.onnx.optimized.onnx'

.. GENERATED FROM PYTHON SOURCE LINES 39-43

.. _l-example-ort-profiling:

Profiling
+++++++++

.. GENERATED FROM PYTHON SOURCE LINES 43-55

.. code-block:: Python

    feeds = {"input": numpy.random.random((1, 3, 112, 112)).astype(numpy.float32)}
    prof_base = ort_profile(
        filename,
        feeds,
        repeat=6,
        disable_optimization=True,
        providers=["CPUExecutionProvider"],
    )
    prof_base.to_excel(f"prof_base{suffix}.xlsx", index=False)
    prof_base

.. rst-class:: sphx-glr-script-out

.. code-block:: none

         cat  pid  tid  dur  ts  ph  name  args_op_name  op_name  args_thread_scheduling_stats  args_output_type_shape  args_output_size  args_parameter_size  args_activation_size  args_node_index  args_input_type_shape  args_provider  event_name  iteration
    0    Session  85022  85022  483  3  X  model_loading_uri  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  model_loading_uri  -1
    1    Session  85022  85022  466  518  X  session_initialization  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  session_initialization  -1
    2    Node  85022  85022  0  1139  X  n0_fence_before  Conv  n0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_before  -1
    3    Node  85022  85022  965  1141  X  n0_kernel_time  Conv  n0  {'main_thread': {'thread_pool_name': 'session-...  [{'float': [1, 64, 112, 112]}]  3211264  7168  150528  0  [{'float': [1, 3, 112, 112]}, {'float': [64, 3...  CPUExecutionProvider  kernel_time  -1
    4    Node  85022  85022  0  2114  X  n0_fence_after  Conv  n0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_after  -1
    ..   ...  ...  ...  ...  ...  ..  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
    261  Node  85022  85022  0  238872  X  n13_fence_before  Add  n13  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_before  4
    262  Node  85022  85022  70  238873  X  n13_kernel_time  Add  n13  {'main_thread': {'thread_pool_name': 'session-...  [{'float': [1, 64, 56, 56]}]  802816  0  1605632  13  [{'float': [1, 64, 56, 56]}, {'float': [1, 64,...  CPUExecutionProvider  kernel_time  4
    263  Node  85022  85022  0  238949  X  n13_fence_after  Add  n13  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_after  4
    264  Session  85022  85022  12851  226101  X  SequentialExecutor::Execute  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  SequentialExecutor::Execute  5
    265  Session  85022  85022  12876  226084  X  model_run  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  model_run  5

    266 rows × 19 columns
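The profile above is a plain pandas DataFrame, so the usual aggregations apply
to it. As a minimal sketch (the frame below is a small illustrative stand-in
for a few of the columns, not the real measurements), the total kernel time
per operator type can be obtained with a groupby, which is essentially the
quantity ``plot_ort_profile`` visualizes:

```python
import pandas as pd

# Illustrative stand-in for three columns of the profile DataFrame;
# the durations are made up for the example, not the values above.
prof = pd.DataFrame(
    {
        "args_op_name": ["Conv", "Conv", "Add", None],
        "event_name": ["kernel_time", "kernel_time", "kernel_time", "model_run"],
        "dur": [965, 672, 70, 12876],
    }
)

# Keep only kernel events, then sum durations per operator type.
kernels = prof[prof["event_name"] == "kernel_time"]
per_op = kernels.groupby("args_op_name")["dur"].sum().sort_values(ascending=False)
print(per_op)
```

Fence and session events carry no operator name, which is why the filter on
``event_name`` comes first.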
.. GENERATED FROM PYTHON SOURCE LINES 56-57

And the optimized model.

.. GENERATED FROM PYTHON SOURCE LINES 57-68

.. code-block:: Python

    prof_opti = ort_profile(
        optimized,
        feeds,
        repeat=6,
        disable_optimization=True,
        providers=["CPUExecutionProvider"],
    )
    prof_opti.to_excel(f"prof_opti{suffix}.xlsx", index=False)
    prof_opti

.. rst-class:: sphx-glr-script-out

.. code-block:: none

         cat  pid  tid  dur  ts  ph  name  args_op_name  op_name  args_thread_scheduling_stats  args_output_type_shape  args_output_size  args_parameter_size  args_activation_size  args_node_index  args_input_type_shape  args_provider  event_name  iteration
    0    Session  85022  85022  497  2  X  model_loading_uri  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  model_loading_uri  -1
    1    Session  85022  85022  378  525  X  session_initialization  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  session_initialization  -1
    2    Node  85022  85022  0  1023  X  r0_nchwc_fence_before  Conv  r0_nchwc  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_before  -1
    3    Node  85022  85022  672  1026  X  r0_nchwc_kernel_time  Conv  r0_nchwc  {'main_thread': {'thread_pool_name': 'session-...  [{'float': [1, 64, 112, 112]}]  3211264  7168  150528  0  [{'float': [1, 3, 112, 112]}, {'float': [64, 3...  CPUExecutionProvider  kernel_time  -1
    4    Node  85022  85022  0  1704  X  r0_nchwc_fence_after  Conv  r0_nchwc  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_after  -1
    ..   ...  ...  ...  ...  ...  ..  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
    351  Node  85022  85022  0  249744  X  ReorderOutput_token_16_fence_before  ReorderOutput  ReorderOutput_token_16  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_before  4
    352  Node  85022  85022  48  249745  X  ReorderOutput_token_16_kernel_time  ReorderOutput  ReorderOutput_token_16  {'main_thread': {'thread_pool_name': 'session-...  [{'float': [1, 64, 56, 56]}]  802816  0  802816  18  [{'float': [1, 64, 56, 56]}]  CPUExecutionProvider  kernel_time  4
    353  Node  85022  85022  0  249796  X  ReorderOutput_token_16_fence_after  ReorderOutput  ReorderOutput_token_16  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  fence_after  4
    354  Session  85022  85022  23834  225966  X  SequentialExecutor::Execute  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  SequentialExecutor::Execute  5
    355  Session  85022  85022  23860  225951  X  model_run  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  model_run  5

    356 rows × 19 columns
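A quick way to compare the two runs end to end is to look at the ``model_run``
events, one per inference iteration, rather than individual kernels. A minimal
sketch on hypothetical per-iteration durations (microseconds, as in the
``dur`` column above; the numbers are illustrative, not the measurements):

```python
import pandas as pd

# Hypothetical per-iteration model_run durations for both profiles.
base = pd.DataFrame({"event_name": ["model_run"] * 3, "dur": [12876, 13010, 12750]})
opti = pd.DataFrame({"event_name": ["model_run"] * 3, "dur": [23860, 23500, 24100]})

def mean_model_run(prof: pd.DataFrame) -> float:
    """Average duration of the model_run events of one profile."""
    runs = prof.loc[prof["event_name"] == "model_run", "dur"]
    return float(runs.mean())

# A ratio below 1 means the baseline was faster in this run.
ratio = mean_model_run(base) / mean_model_run(opti)
print(f"baseline/optimized ratio: {ratio:.2f}")
```

As the tables above show, the optimized model is not guaranteed to be faster
on every machine; that is exactly what this kind of comparison is for.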
.. GENERATED FROM PYTHON SOURCE LINES 69-70

And the graph is:

.. GENERATED FROM PYTHON SOURCE LINES 70-78

.. code-block:: Python

    unique_op = set(prof_base["args_op_name"])
    fig, ax = plt.subplots(2, 2, figsize=(10, len(unique_op)), sharex="col")
    plot_ort_profile(prof_base, ax[0, 0], ax[0, 1], title="baseline")
    plot_ort_profile(prof_opti, ax[1, 0], ax[1, 1], title="optimized")
    fig.tight_layout()
    fig.savefig(f"plot_profiling{suffix}.png")

.. image-sg:: /auto_examples/images/sphx_glr_plot_profiling_001.png
   :alt: baseline, n occurrences, optimized, n occurrences
   :srcset: /auto_examples/images/sphx_glr_plot_profiling_001.png
   :class: sphx-glr-single-img

.. GENERATED FROM PYTHON SOURCE LINES 79-85

Merging profiles
++++++++++++++++

Let's compare both profiles, assuming every iteration processes the same
image and that input and output sizes are identical at every iteration.

.. GENERATED FROM PYTHON SOURCE LINES 85-90

.. code-block:: Python

    merge, gr = merge_ort_profile(prof_base, prof_opti)
    merge.to_excel(f"plot_profiling_merged{suffix}.xlsx", index=False)
    merge

.. rst-class:: sphx-glr-script-out

.. code-block:: none

    /home/xadupre/github/onnx-array-api/onnx_array_api/ort/ort_profile.py:260: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
      .agg(
    /home/xadupre/github/onnx-array-api/onnx_array_api/ort/ort_profile.py:260: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
      .agg(

        args_op_name  args_output_type_shape  args_input_type_shape  args_provider  idx  durbase  countbase  duropti  countopti
    0   Add  [{'float': [1, 64, 56, 56]}]  [{'float': [1, 64, 56, 56]}, {'float': [1, 64,...  CPUExecutionProvider  0  2420.0  6.0  NaN  NaN
    1   BatchNormalization  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}, {'float': [64]}...  CPUExecutionProvider  0  2548.0  6.0  8064.0  6.0
    2   Concat  [{'float': [1, 2, 112, 112]}]  [{'float': [1, 1, 112, 112]}, {'float': [1, 1,...  CPUExecutionProvider  0  166.0  6.0  127.0  6.0
    3   Conv  [{'float': [1, 1, 112, 112]}]  [{'float': [1, 2, 112, 112]}, {'float': [1, 2,...  CPUExecutionProvider  0  2497.0  6.0  NaN  NaN
    4   Conv  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 3, 112, 112]}, {'float': [64, 3...  CPUExecutionProvider  0  5364.0  6.0  4311.0  6.0
    5   Conv  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  0  101656.0  6.0  102240.0  6.0
    6   Conv  [{'float': [1, 64, 56, 56]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  0  9524.0  6.0  1767.0  6.0
    7   Conv  [{'float': [1, 64, 56, 56]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  0  NaN  NaN  53992.0  6.0
    8   Conv  [{'float': [1, 64, 56, 56]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  0  59088.0  6.0  NaN  NaN
    9   Conv  [{'float': [1, 8, 112, 112]}]  [{'float': [1, 2, 112, 112]}, {'float': [8, 2,...  CPUExecutionProvider  0  NaN  NaN  19992.0  6.0
    10  Mul  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 1, 112, 112]}, {'float': [1, 64...  CPUExecutionProvider  0  2140.0  6.0  5791.0  6.0
    11  PRelu  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  0  12364.0  6.0  1828.0  6.0
    12  PRelu  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}, {'float': [64, ...  CPUExecutionProvider  1  2347.0  6.0  1783.0  6.0
    13  ReduceMax  [{'float': [1, 1, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  0  13900.0  6.0  23211.0  6.0
    14  ReduceMean  [{'float': [1, 1, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  0  20925.0  6.0  5553.0  6.0
    15  ReorderInput  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  0  NaN  NaN  6279.0  6.0
    16  ReorderInput  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  1  NaN  NaN  1533.0  6.0
    17  ReorderInput  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  2  NaN  NaN  1280.0  6.0
    18  ReorderOutput  [{'float': [1, 1, 112, 112]}]  [{'float': [1, 8, 112, 112]}]  CPUExecutionProvider  0  NaN  NaN  146.0  6.0
    19  ReorderOutput  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  0  NaN  NaN  6803.0  6.0
    20  ReorderOutput  [{'float': [1, 64, 112, 112]}]  [{'float': [1, 64, 112, 112]}]  CPUExecutionProvider  1  NaN  NaN  1323.0  6.0
    21  ReorderOutput  [{'float': [1, 64, 56, 56]}]  [{'float': [1, 64, 56, 56]}]  CPUExecutionProvider  0  NaN  NaN  931.0  6.0
    22  Sigmoid  [{'float': [1, 1, 112, 112]}]  [{'float': [1, 1, 112, 112]}]  CPUExecutionProvider  0  310.0  6.0  NaN  NaN
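``merge_ort_profile`` essentially aligns the two profiles on the operator
type plus its input/output type-shapes and provider; a row present on only
one side gets ``NaN`` in the other side's columns, as in the table above.
The alignment can be sketched with a plain outer merge (toy single-column
keys, not the library's exact implementation):

```python
import pandas as pd

# Toy per-operator totals; the real key also includes the input/output
# type-shapes and the execution provider.
base = pd.DataFrame({"key": ["Conv", "Add"], "durbase": [59088.0, 2420.0]})
opti = pd.DataFrame({"key": ["Conv", "ReorderOutput"], "duropti": [53992.0, 931.0]})

# An outer merge keeps rows that exist on only one side, filled with NaN,
# which is how removed and inserted operators show up after optimization.
merged = pd.merge(base, opti, on="key", how="outer")
print(merged)
```

Rows with ``NaN`` in ``duropti`` correspond to operators the optimizer
removed; rows with ``NaN`` in ``durbase`` are operators it inserted, such as
the ``ReorderInput``/``ReorderOutput`` nodes above.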
.. GENERATED FROM PYTHON SOURCE LINES 91-92

A more detailed view:

.. GENERATED FROM PYTHON SOURCE LINES 92-96

.. code-block:: Python

    gr.to_excel(f"plot_profiling_merged_details{suffix}.xlsx", index=False)
    gr

.. rst-class:: sphx-glr-script-out

.. code-block:: none

    label                                                                           durbase   duropti  countbase  countopti
    [+CPU]Conv(f-1x2x112x112,f-8x2x7x7)->f-1x8x112x112                                  0.0   19992.0        0.0        6.0
    [+CPU]Conv(f-1x64x112x112,f-64x64x3x3,f-64,f-1x64x56x56)->f-1x64x56x56              0.0   53992.0        0.0        6.0
    [+CPU]ReorderInput(f-1x64x112x112)->f-1x64x112x112                                  0.0    9092.0        0.0       18.0
    [+CPU]ReorderOutput(f-1x64x112x112)->f-1x64x112x112                                 0.0    8126.0        0.0       12.0
    [+CPU]ReorderOutput(f-1x64x56x56)->f-1x64x56x56                                     0.0     931.0        0.0        6.0
    [+CPU]ReorderOutput(f-1x8x112x112)->f-1x1x112x112                                   0.0     146.0        0.0        6.0
    [-CPU]Add(f-1x64x56x56,f-1x64x56x56)->f-1x64x56x56                               2420.0       0.0        6.0        0.0
    [-CPU]Conv(f-1x2x112x112,f-1x2x7x7)->f-1x1x112x112                               2497.0       0.0        6.0        0.0
    [-CPU]Conv(f-1x64x112x112,f-64x64x3x3,f-64)->f-1x64x56x56                       59088.0       0.0        6.0        0.0
    [-CPU]Sigmoid(f-1x1x112x112)->f-1x1x112x112                                       310.0       0.0        6.0        0.0
    [=CPU]BatchNormalization(f-1x64x112x112,f-64,f-64,f-64,f-64)->f-1x64x112x112     2548.0    8064.0        6.0        6.0
    [=CPU]Concat(f-1x1x112x112,f-1x1x112x112)->f-1x2x112x112                          166.0     127.0        6.0        6.0
    [=CPU]Conv(f-1x3x112x112,f-64x3x3x3,f-64)->f-1x64x112x112                        5364.0    4311.0        6.0        6.0
    [=CPU]Conv(f-1x64x112x112,f-64x64x1x1,f-64)->f-1x64x56x56                        9524.0    1767.0        6.0        6.0
    [=CPU]Conv(f-1x64x112x112,f-64x64x3x3,f-64)->f-1x64x112x112                    101656.0  102240.0        6.0        6.0
    [=CPU]Mul(f-1x1x112x112,f-1x64x112x112)->f-1x64x112x112                          2140.0    5791.0        6.0        6.0
    [=CPU]PRelu(f-1x64x112x112,f-64x1x1)->f-1x64x112x112                            14711.0    3611.0       12.0       12.0
    [=CPU]ReduceMax(f-1x64x112x112)->f-1x1x112x112                                  13900.0   23211.0        6.0        6.0
    [=CPU]ReduceMean(f-1x64x112x112)->f-1x1x112x112                                 20925.0    5553.0        6.0        6.0
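Before plotting, the next section drops rows whose combined duration falls
below 1% of the total. On a three-row extract of the table above, the filter
behaves like this:

```python
import pandas as pd

# Three rows taken from the detailed table (durations in microseconds).
gr = pd.DataFrame(
    {
        "durbase": [101656.0, 166.0, 2548.0],
        "duropti": [102240.0, 127.0, 8064.0],
    },
    index=["Conv", "Concat", "BatchNormalization"],
)

# Share of each row in the combined (baseline + optimized) duration.
grmax = gr["durbase"] + gr["duropti"]
share = grmax / grmax.sum()

# Keep rows that account for at least 1% of the total time; the tiny
# Concat row is filtered out.
kept = gr[share >= 0.01]
print(kept.index.tolist())
```

This keeps the bar charts readable by hiding operators whose cost is
negligible in both models.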
.. GENERATED FROM PYTHON SOURCE LINES 97-99

Final plot
++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 99-117

.. code-block:: Python

    # let's filter out insignificant operators.
    grmax = gr["durbase"] + gr["duropti"]
    total = grmax.sum()
    grmax /= total
    gr = gr[grmax >= 0.01]

    fig, ax = plt.subplots(1, 2, figsize=(14, min(gr.shape[0], 500)), sharey=True)
    gr[["durbase", "duropti"]].plot.barh(ax=ax[0])
    ax[0].set_title("Side by side duration")
    gr = gr.copy()
    gr[["countbase", "countopti"]].plot.barh(ax=ax[1])
    ax[1].set_title("Side by side count")
    fig.tight_layout()
    fig.savefig(f"plot_profiling_side_by_side{suffix}.png")

.. image-sg:: /auto_examples/images/sphx_glr_plot_profiling_002.png
   :alt: Side by side duration, Side by side count
   :srcset: /auto_examples/images/sphx_glr_plot_profiling_002.png
   :class: sphx-glr-single-img

.. GENERATED FROM PYTHON SOURCE LINES 118-120

On CUDA
+++++++

.. GENERATED FROM PYTHON SOURCE LINES 120-172

.. code-block:: Python

    if "CUDAExecutionProvider" in get_available_providers():
        print("Profiling on CUDA")
        prof_base = ort_profile(
            filename,
            feeds,
            repeat=6,
            disable_optimization=True,
            providers=["CUDAExecutionProvider"],
        )
        prof_base.to_excel(f"prof_cuda_base{suffix}.xlsx", index=False)

        prof_opti = ort_profile(
            optimized,
            feeds,
            repeat=6,
            disable_optimization=True,
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        prof_opti.to_excel(f"prof_cuda_opti{suffix}.xlsx", index=False)

        unique_op = set(prof_base["args_op_name"])
        fig, ax = plt.subplots(2, 2, figsize=(10, len(unique_op)), sharex="col")
        plot_ort_profile(prof_base, ax[0, 0], ax[0, 1], title="baseline")
        plot_ort_profile(prof_opti, ax[1, 0], ax[1, 1], title="optimized")
        fig.tight_layout()
        fig.savefig(f"plot_profiling_cuda{suffix}.png")

        merge, gr = merge_ort_profile(prof_base, prof_opti)
        merge.to_excel(f"plot_profiling_merged{suffix}.xlsx", index=False)
        gr.to_excel(f"plot_profiling_merged_details{suffix}.xlsx", index=False)

        grmax = gr["durbase"] + gr["duropti"]
        total = grmax.sum()
        grmax /= total
        gr = gr[grmax >= 0.01]

        fig, ax = plt.subplots(1, 2, figsize=(14, min(gr.shape[0], 500)), sharey=True)
        gr[["durbase", "duropti"]].plot.barh(ax=ax[0])
        ax[0].set_title("Side by side duration")
        gr = gr.copy()
        gr[["countbase", "countopti"]].plot.barh(ax=ax[1])
        ax[1].set_title("Side by side count")
        fig.tight_layout()
        fig.savefig(f"plot_profiling_side_by_side_cuda{suffix}.png")
    else:
        print(f"CUDA not available in {get_available_providers()}.")
        fig, ax = None, None

    ax

.. rst-class:: sphx-glr-horizontal

    *

      .. image-sg:: /auto_examples/images/sphx_glr_plot_profiling_003.png
         :alt: baseline, n occurrences, optimized, n occurrences
         :srcset: /auto_examples/images/sphx_glr_plot_profiling_003.png
         :class: sphx-glr-multi-img

    *

      .. image-sg:: /auto_examples/images/sphx_glr_plot_profiling_004.png
         :alt: Side by side duration, Side by side count
         :srcset: /auto_examples/images/sphx_glr_plot_profiling_004.png
         :class: sphx-glr-multi-img

.. rst-class:: sphx-glr-script-out

.. code-block:: none

    Profiling on CUDA
    /home/xadupre/github/onnx-array-api/onnx_array_api/ort/ort_profile.py:260: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
      .agg(
    /home/xadupre/github/onnx-array-api/onnx_array_api/ort/ort_profile.py:260: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
      .agg(

    array([, ], dtype=object)

.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 6.359 seconds)

.. _sphx_glr_download_auto_examples_plot_profiling.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: plot_profiling.ipynb `

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: plot_profiling.py `

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: plot_profiling.zip `

.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery `_