Skip to content

Instantly share code, notes, and snippets.

Loading pipeline components...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00, 1.42it/s]
Compiling transformer
Pipeline created and compiled
Initial compilation/warmup for each resolution
Warming up at 512x512
0%| | 0/25 [00:00<?, ?it/s]INFO:torch._inductor.cudagraph_trees.__cudagraphs:recording cudagraph tree for symint key (2, 4, 64, 2, 9216, 256, 0, 1000, 256, 2, 4)
DEBUG:torch._inductor.cudagraph_trees.__cudagraphs:Running warmup of function 0
4%|█████▍ | 1/25 [05:49<2:19:45, 349.40s/it]DEBUG:torch._inductor.cudagraph_trees.__cudagraphs:Recording function 0 of graph recording id 0
100%|█████████████████████████████████████████████████████████
Loading pipeline components...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 9.31it/s]
Compiling transformer
Pipeline created and compiled
Initial compilation/warmup for each resolution
Warming up at 512x512
0%| | 0/25 [00:00<?, ?it/s]INFO:torch._inductor.cudagraph_trees.__cudagraphs:recording cudagraph tree for symint key (2, 4, 64, 2, 9216, 256, 0, 1000, 256, 2, 4)
DEBUG:torch._inductor.cudagraph_trees.__cudagraphs:Running warmup of function 0
4%|█████▍ | 1/25 [04:08<1:39:30, 248.79s/it]DEBUG:torch._inductor.cudagraph_trees.__cudagraphs:Recording function 0 of graph recording id 0
100%|█████████████████████████████████████████████████████████
Loading pipeline components...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00, 1.42it/s]
Compiling transformer
Pipeline created and compiled
Initial compilation/warmup for each resolution
Warming up at 512x512
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [04:17<00:00, 10.31s/it]
Warming up at 1024x1024
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:05<00:00, 4.87it/s]
Warming up at 1280x1024
def count_consecutive_patterns(filename):
    """Count runs of consecutive collective-op call lines in a graph dump.

    Scans *filename* for printed FX-graph lines containing
    `torch.ops._c10d_functional.all_reduce(` or `all_to_all_single(`.

    NOTE(review): this definition is truncated in the paste — only its
    state setup is visible; the scanning loop and return are missing.
    """
    # The leading `" = ` anchors matches to the assignment form of a
    # call line in the printed graph, not to arbitrary mentions.
    all_reduce_pattern = "\" = torch.ops._c10d_functional.all_reduce("
    all_to_all_pattern = "\" = torch.ops._c10d_functional.all_to_all_single("
    current_streak = 0  # length of the run currently being tracked
    current_pattern = None  # which of the two patterns the current run matches
    current_start_line = None  # line where the current run began — presumably 1-based; verify
    results = []  # completed runs — contents set by the missing loop; TODO confirm shape
    line_number = 0
    pattern_matches = []  # Store all pattern matches with line numbers
rank29.txt rank7.txt
------------------------------------------------------------------------------------------------------------------------
all_reduce appeared 376 times in a row (starting line 3165) all_reduce appeared 376 times in a row (starting line 3059)
all_to_all appeared 1 times in a row (starting line 14578) all_to_all appeared 1 times in a row (starting line 14472)
all_reduce appeared 23 times in a row (starting line 14665) all_reduce appeared 23 times in a row (starting line 14558)
all_to_all appeared 1 times in a row (starting line 15689) all_to_all appeared 1 times in a row (starting line 15582)
all_reduce appeared 23 times in a row (starting line 15776) all_reduce appeared 23 times in a row (starting line 15668)
all_to_all appeared 1 times in a row (starting line 16801) all_to_all appeared 1 times in a row (starting line 16693)
all_reduce appeared 23 times in a row (starting l
# Minimal multi-rank torch.compile repro: one process per rank over NCCL.
import torch
import torch.distributed as dist
# Allow ranks to exchange compile-time information so they converge on
# consistent compiled graphs across the process group.
torch._dynamo.config.enable_compiler_collectives = True
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)  # bind this process to its rank's GPU
# NOTE(review): the function this decorator applies to is missing from the paste.
@torch.compile(backend="eager", fullgraph=True)
Traceback (most recent call last):
File "/home/xmfan/empathy/coqui-ai-TTS/xtts2.py", line 23, in <module>
warmup()
File "/home/xmfan/.conda/envs/empathy310/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
return fn(*args, **kwargs)
File "/home/xmfan/empathy/coqui-ai-TTS/xtts2.py", line 19, in warmup
fn()
File "/home/xmfan/empathy/coqui-ai-TTS/xtts2.py", line 13, in fn
tts.tts(text="Hello from XTTS2. I am being tested for the torch.compile User Empathy Day on Nov 20th 2024.", speaker_wav="en_sample.wav", language="en")
File "/home/xmfan/empathy/coqui-ai-TTS/TTS/api.py", line 276, in tts
@xmfan
xmfan / fusedbwdoptim.py
Created November 19, 2024 18:19
Fused backward + Simple optimizer implementation
import torch
import torch.nn as nn
# Trace the backward pass with compiled autograd, so that
# `loss.backward()` inside the compiled function is captured rather
# than run eagerly.
torch._dynamo.config.compiled_autograd = True
@torch.compile
def train(model, x):
    """Run one fused forward+backward step on *model* with input *x*.

    Computes a scalar loss as the sum of the model output, backprops it,
    then clears all gradients.
    """
    loss = model(x).sum()
    loss.backward()
    # Drop grads by setting them to None (cheaper than zeroing) —
    # presumably the "simple optimizer" step named in the gist title
    # applies updates before this; TODO confirm against the full file.
    for param in model.parameters():
        param.grad = None
# Setup fragment for a distributed compiled-autograd repro.
import os
import functools
import torch
import torch.nn as nn
import torch.distributed as dist
from torch._dynamo import compiled_autograd
from torch.distributed.distributed_c10d import _get_default_group
# File-based rendezvous path — presumably passed to init_process_group
# further down; the usage is not visible in this paste.
FILE_PATH = "/tmp/chienchin_rendezvous"
DIM=2000  # model/layer width used by the (unseen) module definition — TODO confirm
# AOT ID: ['1_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile