====== Example: GPT ======
  * References
    * https://
    * https://
    * https://
    * https://
    * https://
===== V2 =====
<code python gpt_v2.py>
import argparse
import math

import numpy as np
import plotille
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from gr.pygr import mlab
from IPython import embed
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

def parse_args():
    parser = argparse.ArgumentParser()
    # NOTE: the original argument defaults (and some of the names) were truncated in
    # the wiki source; the names below follow how `args` is used later in the script
    # and the defaults are reconstructed guesses.
    parser.add_argument('--dataset', type=str, default='BasicDataset')
    parser.add_argument('--block_size', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--lr', type=float, default=6e-4)
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--warmup', type=int, default=10)
    parser.add_argument('--grad_clip', type=float, default=1.0)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--noise_scale', type=float, default=0.01)
    parser.add_argument('--repeat', type=int, default=10)
    parser.add_argument('--test_steps', type=int, default=512)
    parser.add_argument('--device', type=str, default='cuda')
    return parser.parse_args()

args = parse_args()

class CausalSelfAttention(nn.Module):
    """
    https://
    """

    def __init__(self, d_model, n_head, block_size, dropout):
        super().__init__()
        assert d_model % n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(d_model, d_model)
        self.query = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        # regularization
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        # output projection
        self.proj = nn.Linear(d_model, d_model)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
        )
        self.n_head = n_head

    def forward(self, x):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)    # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    """ Transformer block """

    def __init__(self, d_model, n_head, block_size, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_head, block_size, dropout)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPTModel(nn.Module):
    def __init__(self, input_dims, block_size, dropout=0.1):
        super().__init__()
        self.n_layers = 6
        self.n_heads = 8
        self.d_model = 512
        self.block_size = block_size

        self.we = nn.Linear(input_dims, self.d_model)                      # input embedding
        self.wp = nn.Parameter(torch.zeros(1, block_size, self.d_model))   # learned position embedding
        self.blocks = nn.Sequential(*[
            Block(self.d_model, self.n_heads, self.block_size, dropout)
            for _ in range(self.n_layers)
        ])
        self.norm = nn.LayerNorm(self.d_model)
        self.wd = nn.Linear(self.d_model, input_dims)                      # decode back to the input space

        self.apply(self._init_weights)
        print(f'# of parameters: {sum(p.numel() for p in self.parameters())}')

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, src):
        B, T, C = src.size()
        src_embed = self.we(src)
        pos_embed = self.wp[:, :T, :]
        hx = src_embed + pos_embed
        hx = self.blocks(hx)
        hx = self.norm(hx)
        out = self.wd(self.norm(hx))
        # slide the input window one step forward using the newest prediction
        src = torch.cat([src[:, 1:], out[:, -1:]], dim=1)
        return out, src

class BasicDataset(Dataset):

    def __init__(self, block_size, noise_scale, repeat):
        self.block_size = block_size

        self.data = np.sin(np.arange(10240) / 10.)
        # self.data = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5
        # self.data = np.abs(np.sin(np.arange(10240) / 10.))
        # data = np.sin(np.arange(10240) / 10.) * (np.sin(np.arange(10240) / 10.) > 0.0)
        self.data = self.data.astype(np.float32)
        self.data = self.data.reshape(-1, 1)
        self.data_std = self.data.std(0)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # pick a window at a random spot in the sequence
        i = np.random.randint(0, len(self.data) - (self.block_size + 1))
        chunk = self.data[i:i + self.block_size + 1]
        # copy + add noise (avoid modifying the underlying array in place)
        chunk = chunk + np.random.normal(0, self.noise_scale, chunk.shape) * self.data_std
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size]
        gen = tgt[:, :self.block_size]
        return tgt, src, gen

class MotionDataset(Dataset):

    def __init__(self, block_size, noise_scale, repeat):
        self.block_size = block_size

        import urllib.request, json
        # NOTE: the original URL, JSON key, column selection and tile count were
        # truncated in the wiki source; the placeholders below only mark where they were.
        url = "..."
        self.data = json.loads(urllib.request.urlopen(url).read())['...']
        self.data = np.array(self.data, dtype=np.float32)
        self.data = np.hstack([self.data[:, ...]])
        self.data = np.tile(self.data, (1, 1))
        self.dims = self.data.shape[-1]
        self.data_mean = self.data.mean(0, keepdims=True)
        self.data_std = self.data.std(0, keepdims=True)
        self.data = (self.data - self.data_mean) / self.data_std

        self.data = self.data.astype(np.float32)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # pick a window at a random spot in the sequence
        i = np.random.randint(0, len(self.data) - (self.block_size + 1))
        chunk = self.data[i:i + self.block_size + 1]
        # copy + add noise (avoid modifying the underlying array in place)
        chunk = chunk + np.random.normal(0, self.noise_scale, chunk.shape) * self.data_std
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size]
        gen = tgt[:, :self.block_size]
        return tgt, src, gen

if __name__ == '__main__':

    # create the dataloader
    Dataset = globals()[args.dataset]
    dataset = Dataset(args.block_size, args.noise_scale, args.repeat)
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # create the model
    dim = dataset.data.shape[-1]
    model = GPTModel(dim, args.block_size).to(args.device)

    # create the optimizer
    # (the exact no-decay list was truncated in the wiki source; reconstructed here to
    #  cover biases, LayerNorms and the position embedding)
    no_decay = ["bias", "ln", "norm", "wp"]
    params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    optim_groups = [
        {"params": params_decay, "weight_decay": args.weight_decay},
        {"params": params_nodecay, "weight_decay": 0.0},
    ]
    optimizer = optim.AdamW(optim_groups, lr=args.lr)

    def warmup_cosine(optimizer, lr, warmup, epoch):
        s = float(epoch <= warmup)
        w = s*(epoch / warmup) + (1-s)*(0.5 * (1 + np.cos(np.pi * epoch)))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr * w

    step = 0
    train_loss_list = list()
    test_score_list = list()

    for epoch in tqdm.trange(args.max_epoch):
        # fitting
        model.train()
        for i, (src, tgt) in tqdm.tqdm(enumerate(loader), total=len(loader)):
            src, tgt = src.to(args.device), tgt.to(args.device)

            gen, _ = model(src)

            optimizer.zero_grad()
            loss = (0.5 * (tgt - gen) ** 2).mean()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            optimizer.step()
            warmup_cosine(optimizer, args.lr, args.warmup, epoch + i / len(loader))

            step += 1 / len(loader)
            train_loss_list.append((step, loss.item()))

        tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000:])))

        # eval
        model.eval()
        tgt, src, gen = dataset.get_test_data(args.test_steps, args.device)

        with torch.no_grad():
            for i in range(args.test_steps - args.block_size):
                gen_, src = model(src)
                gen = torch.cat([gen, gen_[:, -1:]], dim=1)

        loss = (0.5 * (tgt - gen) ** 2).mean()
        score = (-loss).exp()
        test_score_list.append((step, score.item()))

        mlab.plot(tgt.cpu().numpy()[0, :, 0])
        mlab.oplot(gen.cpu().numpy()[0, :, 0])
        tqdm.tqdm.write(plotille.scatter(*zip(*test_score_list[-1000:])))
        tqdm.tqdm.write(str(args))

    embed()

</code>

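The V2 model's ''forward()'' returns both the prediction and the input window shifted forward by one step, so free-running generation is just a matter of feeding the returned ''src'' back in. Below is a minimal sketch of that loop; it assumes the reconstructed ''GPTModel(input_dims, block_size)'' signature above and that ''gpt_v2.py'' is importable (importing it also runs ''parse_args()'', so invoke the sketch without conflicting command-line arguments).

<code python generate_sketch.py>
# Minimal generation sketch (assumption: gpt_v2.py above is on the import path and the
# reconstructed GPTModel(input_dims, block_size) signature is correct).
import torch

from gpt_v2 import GPTModel

block_size, dims = 64, 1
model = GPTModel(dims, block_size).eval()

# seed window: (batch, time, features) - here the first block_size points of a sine wave
t = torch.arange(block_size, dtype=torch.float32)
src = torch.sin(t / 10.).view(1, block_size, 1)

steps = []
with torch.no_grad():
    for _ in range(128):
        out, src = model(src)      # forward() also returns the window advanced by one step
        steps.append(out[:, -1:])  # keep only the newest prediction

continuation = torch.cat(steps, dim=1)  # (1, 128, 1): free-running continuation of the seed
print(continuation.shape)
</code>
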
===== V1 =====
<code python gpt.py>
# ... (unchanged lines omitted in the revision diff) ...

from gr.pygr import mlab
from IPython import embed
from traitlets.config.loader import ArgumentParser


def parse_args():
    parser = ArgumentParser()
    # the four argument definitions here were truncated in the wiki source
    parser.add_argument('...')
    parser.add_argument('...')
    parser.add_argument('...')
    parser.add_argument('...')
    return parser.parse_args()

args = parse_args()

# ... (unchanged lines omitted in the revision diff; the methods below belong to the
#      custom multi-head attention helper class, whose class header is not shown) ...

    def __init__(self, key_dim, num_heads, drop=0.1):
        super().__init__()
        self.scale = np.power(key_dim, 0.5)
        self.n_heads = num_heads
        self.dropout = nn.Dropout(drop)

    def forward(self, q, k, v, mask=None):
        q = self.split_heads(q)
        k = self.split_heads(k, key=True)
        v = self.split_heads(v)
        w = torch.matmul(q, k)
        w = w / self.scale
        attn = F.softmax(w, dim=-1)
        attn = self.dropout(attn)
        context = torch.matmul(attn, v)
        context = self.merge_heads(context)
        return context, attn

    def split_heads(self, x, key=False):
        seq, bs, emb = x.size()
        d_k = emb // self.n_heads
        x = x.view(seq, bs, self.n_heads, d_k)
        if key:
            # bs, self.n_heads, d_k, seq
            x = x.permute(1, 2, 3, 0)
        else:
            # bs, self.n_heads, seq, d_k
            x = x.permute(1, 2, 0, 3)
        return x

    def merge_heads(self, x):
        bs, n_heads, seq, d_k = x.size()
        x = x.permute(2, 0, 1, 3)
        x = x.reshape(seq, bs, -1)
        return x


class MHA(nn.Module):
    def __init__(self, embed_dim, num_heads, drop=0.1):
        super().__init__()
        self.n_heads = num_heads
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=drop)
        self.out = nn.Linear(embed_dim, embed_dim)
        layers = (self.qkv, self.out)
        for layer in layers:
            torch.nn.init.xavier_uniform_(layer.weight)
        self.out.bias.data.zero_()

    def forward(self, x, src_mask=None):
        seq, bsz, emb = x.size()
        q, k, v = self.qkv(x).split(emb, dim=2)
        context, _ = self.attn(q, k, v, attn_mask=src_mask)
        return self.out(context)
# ... (unchanged lines omitted in the revision diff: the MLP class definition up to its forward) ...

    def forward(self, x):
        # layer attribute names below are reconstructions; the originals were truncated
        x = self.fc1(x)
        x = F.gelu(x)
        x = self.fc2(x)
        return x


class CustomBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, drop=0.1):
        super().__init__()
        self.ln_1 = nn.LayerNorm(embed_dim)
        self.attn = MHA(embed_dim, num_heads, drop)
        self.ln_2 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim)

    def forward(self, x, src_mask=None):
        x = x + self.attn(self.ln_1(x), src_mask)
        x = x + self.mlp(self.ln_2(x))
        return x


class Block(nn.TransformerEncoderLayer):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__(d_model, nhead, dim_feedforward, dropout)
        self.activation = F.gelu

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # MHA
        x = self.norm1(src)
        x = self.self_attn(x, x, x, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(x)
        # MLP
        x = self.linear2(self.dropout(self.activation(self.linear1(self.norm2(src)))))
        src = src + self.dropout2(x)
        return src
class GPTModel(nn.Module):
    def __init__(self, input_dims, output_dims, max_len, custom_block=False):
        super().__init__()
        self.n_layers = 3
        self.n_heads = 16
        self.d_model = 512
        self.max_len = max_len
        self.we = nn.Linear(input_dims, self.d_model)
        self.wp = nn.Embedding(self.max_len, self.d_model)
        # the flag below is a reconstruction; the original condition was truncated
        if custom_block:
            self.blocks = nn.ModuleList([
                CustomBlock(self.d_model, self.n_heads)
                for _ in range(self.n_layers)
            ])
        else:
            self.blocks = nn.ModuleList([
                Block(self.d_model, self.n_heads, 4 * self.d_model)
                for _ in range(self.n_layers)
            ])
        self.norm = nn.LayerNorm(self.d_model)
        self.wd = nn.Linear(self.d_model, output_dims)
        # init ranges below were truncated in the wiki source
        torch.nn.init.normal_(self.we.weight, std=0.02)
        torch.nn.init.uniform_(self.wp.weight, -0.02, 0.02)
        torch.nn.init.normal_(self.wd.weight, std=0.02)

    def forward(self, src):
        src_embed = self.we(src)
        pos_idx = torch.arange(src.size(0), device=src.device)
        pos_embed = self.wp(pos_idx).unsqueeze(1)
        hx = src_embed + pos_embed
        src_mask = self.generate_src_mask(src.size(0), src.device)
        for block in self.blocks:
            hx = block(hx, src_mask=src_mask)
        hx = self.norm(hx)
        out = self.wd(self.norm(hx))
        # slide the (seq-first) input window one step forward using the newest prediction
        src = torch.cat([src[1:], out[-1:]], dim=0)
        return out, src

    @staticmethod
    def generate_src_mask(size, device):
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().to(device)
        mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
# ... (unchanged lines omitted in the revision diff; the code below runs in the
#      script's __main__ section) ...

    n_epochs = 2500
    # NOTE: the next few assignments were truncated in the wiki source; the names
    # follow how they are used below and the values are placeholders.
    prev_steps, next_steps = 32, 32
    bsz = 64
    test_steps = 256
    lr, warmup = 1e-4, 10
    device = 'cuda'
    dataset = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5

    model = GPTModel(1, 1, prev_steps + next_steps).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def warmup_cosine(optimizer, lr, warmup, epoch):
        s = float(epoch <= warmup)
        w = s*(epoch / warmup) + (1-s)*(0.5 * (1 + np.cos(np.pi * epoch)))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr * w

    step = 0
    train_loss_list = list()
    test_loss_list = list()

    for epoch in tqdm.trange(n_epochs):
        # make batch id
        bid = np.arange(len(dataset)-(prev_steps + next_steps))
        np.random.shuffle(bid)
        bid = bid[:len(bid) // bsz * bsz]
        bid = bid.reshape(-1, 1, bsz)
        pos = np.arange(prev_steps + next_steps).reshape(1, -1, 1)
        idxes = bid + pos  # mini-batch x seq x data-index
        for i, idx in enumerate(idxes):
            data = dataset[idx].reshape((prev_steps + next_steps, bsz, 1))
            tgt = torch.tensor(
                data,
                dtype=torch.float32, device=device,
            )
            gen, _ = model(tgt)
            optimizer.zero_grad()
            loss = (0.5 * (tgt[1:] - gen[:-1]) ** 2).mean()
            loss.backward()
            optimizer.step()
            # scheduler.step(epoch + i / len(idxes))
            warmup_cosine(optimizer, lr, warmup, epoch + i / len(idxes))
            step += 1 / len(idxes)
            train_loss_list.append((step, loss.item()))

        # eval: free-running generation from a random window
        idx = np.random.randint(0, len(dataset)-(prev_steps + test_steps), (1,))
        idx = idx + np.arange(prev_steps + test_steps).reshape(-1, 1)
        data = dataset[idx]
        tgt = torch.tensor(
            data.reshape(prev_steps + test_steps, 1, 1),
            dtype=torch.float32, device=device,
        )
        src = tgt[:prev_steps]
        gen = tgt[:prev_steps]
        with torch.no_grad():
            for _ in range(test_steps):
                gen_, src = model(src)
                gen = torch.cat([gen, gen_[-1:]], dim=0)

        mlab.plot(data.reshape(-1))
        mlab.oplot(gen.squeeze_().cpu().numpy())
        loss = (0.5 * (data.reshape(-1) - gen.squeeze_().cpu().numpy()) ** 2).mean()
        test_loss_list.append((step, loss))

        tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000:])))
        tqdm.tqdm.write(plotille.scatter(*zip(*test_loss_list[-1000:])))
        tqdm.tqdm.write(str(args))

    embed()

</code>
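
V1 and V2 enforce causality in different ways: V1's ''generate_src_mask()'' builds an additive float mask (0 for allowed positions, ''-inf'' for future positions) that is passed to ''nn.MultiheadAttention'' via ''attn_mask'', while V2 registers a boolean lower-triangular buffer and applies ''masked_fill'' just before the softmax. The small standalone sketch below (not part of either script) checks that the two styles produce the same attention pattern.

<code python causal_mask_check.py>
# Standalone sketch: the additive -inf mask (V1 style) and the boolean tril mask
# (V2 style) yield identical attention weights for the same scores.
import torch
import torch.nn.functional as F

T = 5
scores = torch.randn(T, T)  # unnormalized attention scores for one head

# V1 style: 0 on allowed (past/current) positions, -inf on future positions
allowed = (torch.triu(torch.ones(T, T)) == 1).transpose(0, 1)
additive = torch.zeros(T, T).masked_fill(~allowed, float('-inf'))
attn_v1 = F.softmax(scores + additive, dim=-1)

# V2 style: boolean lower-triangular buffer, future positions filled with -inf
tril = torch.tril(torch.ones(T, T))
attn_v2 = F.softmax(scores.masked_fill(tril == 0, float('-inf')), dim=-1)

print(torch.allclose(attn_v1, attn_v2))  # True
</code>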
{{tag>