====== GPT Example ======

  * References
    * https://
    * https://
    * https://
    * https://
    * https://

===== V2 =====

<code python gpt_v2.py>
import argparse
import math

import numpy as np
import plotille
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from gr.pygr import mlab
from IPython import embed
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
| + | |||
| + | def parse_args(): | ||
| + | parser = argparse.ArgumentParser() | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | return parser.parse_args() | ||
| + | |||
| + | args = parse_args() | ||
| + | |||
| + | |||
| + | class CausalSelfAttention(nn.Module): | ||
| + | """ | ||
| + | https:// | ||
| + | """ | ||
| + | |||
| + | def __init__(self, | ||
| + | super().__init__() | ||
| + | assert d_model % n_head == 0 | ||
| + | # key, query, value projections for all heads | ||
| + | self.key = nn.Linear(d_model, | ||
| + | self.query = nn.Linear(d_model, | ||
| + | self.value = nn.Linear(d_model, | ||
| + | # regularization | ||
| + | self.attn_drop = nn.Dropout(dropout) | ||
| + | self.resid_drop = nn.Dropout(dropout) | ||
| + | # output projection | ||
| + | self.proj = nn.Linear(d_model, | ||
| + | # causal mask to ensure that attention is only applied to the left in the input sequence | ||
| + | self.register_buffer( | ||
| + | " | ||
| + | torch.tril(torch.ones(block_size, | ||
| + | ) | ||
| + | self.n_head = n_head | ||
| + | |||
| + | def forward(self, | ||
| + | B, T, C = x.size() | ||
| + | |||
| + | # calculate query, key, values for all heads in batch and move head forward to be the batch dim | ||
| + | k = self.key(x).view(B, | ||
| + | q = self.query(x).view(B, | ||
| + | v = self.value(x).view(B, | ||
| + | |||
| + | # causal self-attention; | ||
| + | att = (q @ k.transpose(-2, | ||
| + | att = att.masked_fill(self.mask[:,:,: | ||
| + | att = F.softmax(att, | ||
| + | att = self.attn_drop(att) | ||
| + | y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) | ||
| + | y = y.transpose(1, | ||
| + | |||
| + | # output projection | ||
| + | y = self.resid_drop(self.proj(y)) | ||
| + | return y | ||
| + | |||


class Block(nn.Module):
    """A standard pre-LayerNorm Transformer block."""

    def __init__(self, d_model, n_head, block_size, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_head, block_size, dropout)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
| + | |||
| + | class GPTModel(nn.Module): | ||
| + | def __init__(self, | ||
| + | super().__init__() | ||
| + | self.n_layers = 6 | ||
| + | self.n_heads = 8 | ||
| + | self.d_model = 512 | ||
| + | self.block_size = block_size | ||
| + | |||
| + | self.we = nn.Linear(input_dims, | ||
| + | self.wp = nn.Parameter(torch.zeros(1, | ||
| + | self.blocks = nn.Sequential(*[ | ||
| + | Block(self.d_model, | ||
| + | for _ in range(self.n_layers) | ||
| + | ]) | ||
| + | self.norm = nn.LayerNorm(self.d_model) | ||
| + | self.wd = nn.Linear(self.d_model, | ||
| + | |||
| + | self.apply(self._init_weights) | ||
| + | print(f' | ||
| + | |||
| + | def _init_weights(self, | ||
| + | if isinstance(module, | ||
| + | module.weight.data.normal_(mean=0.0, | ||
| + | if isinstance(module, | ||
| + | module.bias.data.zero_() | ||
| + | elif isinstance(module, | ||
| + | module.bias.data.zero_() | ||
| + | module.weight.data.fill_(1.0) | ||
| + | |||
| + | def forward(self, | ||
| + | B, T, C = src.size() | ||
| + | src_embed = self.we(src) | ||
| + | pos_embed = self.wp[:, :T, :] | ||
| + | hx = src_embed + pos_embed | ||
| + | hx = self.blocks(hx) | ||
| + | hx = self.norm(hx) | ||
| + | out = self.wd(self.norm(hx)) | ||
| + | src = torch.cat([src[:, | ||
| + | return out, src | ||
| + | |||
| + | |||
| + | class BasicDataset(Dataset): | ||
| + | |||
| + | def __init__(self, | ||
| + | self.block_size = block_size | ||
| + | |||
| + | self.data = np.sin(np.arange(10240) / 10.) | ||
| + | # self.data = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5 | ||
| + | # self.data = np.abs(np.sin(np.arange(10240) / 10.)) | ||
| + | # data = np.sin(np.arange(10240) / 10.) * (np.sin(np.arange(10240) / 10.) > 0.0) | ||
| + | self.data = self.data.astype(np.float32) | ||
| + | self.data = self.data.reshape(-1, | ||
| + | self.data_std = self.data.std(0) | ||
| + | self.repeat = repeat | ||
| + | self.noise_scale = noise_scale | ||
| + | | ||
| + | def __len__(self): | ||
| + | # return math.ceil(len(self.data) / (self.block_size + 1)) | ||
| + | return len(self.data) * self.repeat | ||
| + | |||
| + | def __getitem__(self, | ||
| + | # we're actually going to " | ||
| + | i = np.random.randint(0, | ||
| + | chunk = self.data[i: | ||
| + | chunk += np.random.normal(0, | ||
| + | x = torch.tensor(chunk[: | ||
| + | y = torch.tensor(chunk[1: | ||
| + | return x, y | ||
| + | |||
| + | def get_test_data(self, | ||
| + | i = np.random.randint(0, | ||
| + | idx = np.arange(i, | ||
| + | data = self.data[idx].reshape(1, | ||
| + | tgt = torch.tensor(data, | ||
| + | src = tgt[:, : | ||
| + | gen = tgt[:, : | ||
| + | return tgt, src, gen | ||
| + | |||
| + | |||
| + | class MotionDataset(Dataset): | ||
| + | | ||
| + | def __init__(self, | ||
| + | self.block_size = block_size | ||
| + | |||
| + | import urllib, json | ||
| + | url = " | ||
| + | self.data = json.loads(urllib.request.urlopen(url).read())[' | ||
| + | self.data = np.array(self.data, | ||
| + | self.data = np.hstack([self.data[:, | ||
| + | self.data = np.tile(self.data, | ||
| + | self.dims = self.data.shape[-1] | ||
| + | self.data_mean = self.data.mean(0, | ||
| + | self.data_std = self.data.std(0, | ||
| + | self.data = (self.data - self.data_mean) / self.data_std | ||
| + | |||
| + | self.data = self.data.astype(np.float32) | ||
| + | self.repeat = repeat | ||
| + | self.noise_scale = noise_scale | ||
| + | | ||
| + | def __len__(self): | ||
| + | # return math.ceil(len(self.data) / (self.block_size + 1)) | ||
| + | return len(self.data) * self.repeat | ||
| + | |||
| + | def __getitem__(self, | ||
| + | # we're actually going to " | ||
| + | i = np.random.randint(0, | ||
| + | chunk = self.data[i: | ||
| + | chunk += np.random.normal(0, | ||
| + | x = torch.tensor(chunk[: | ||
| + | y = torch.tensor(chunk[1: | ||
| + | return x, y | ||
| + | |||
| + | def get_test_data(self, | ||
| + | i = np.random.randint(0, | ||
| + | idx = np.arange(i, | ||
| + | data = self.data[idx].reshape(1, | ||
| + | tgt = torch.tensor(data, | ||
| + | src = tgt[:, : | ||
| + | gen = tgt[:, : | ||
| + | return tgt, src, gen | ||
| + | |||
| + | |||
| + | if __name__ == ' | ||
| + | |||
| + | # create the dataloader | ||
| + | Dataset = globals()[args.dataset] | ||
| + | dataset = Dataset(args.block_size, | ||
| + | loader = DataLoader(dataset, | ||
| + | | ||
| + | # create the model | ||
| + | dim = dataset.data.shape[-1] | ||
| + | model = GPTModel(dim, | ||
| + | |||
| + | # create the optimizer | ||
| + | no_decay = [" | ||
| + | params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] | ||
| + | params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)] | ||
| + | optim_groups = [ | ||
| + | {" | ||
| + | {" | ||
| + | ] | ||
| + | optimizer = optim.AdamW(optim_groups, | ||
| + | |||
| + | def warmup_cosine(optimizer, | ||
| + | s = float(epoch <= warmup) | ||
| + | w = s*(epoch / warmup) + (1-s)*(0.5 * (1 + np.cos(np.pi * epoch))) | ||
| + | for param_group in optimizer.param_groups: | ||
| + | param_group[' | ||
| + | |||
| + | step = 0 | ||
| + | train_loss_list = list() | ||
| + | test_score_list = list() | ||
| + | | ||
| + | for epoch in tqdm.trange(args.max_epoch): | ||
| + | # fitting | ||
| + | model.train() | ||
| + | for i, (src, tgt) in tqdm.tqdm(enumerate(loader), | ||
| + | src, tgt = src.to(args.device), | ||
| + | |||
| + | gen, _ = model(src) | ||
| + | |||
| + | optimizer.zero_grad() | ||
| + | loss = (0.5 * (tgt - gen) ** 2).mean() | ||
| + | loss.backward() | ||
| + | nn.utils.clip_grad_norm_(model.parameters(), | ||
| + | optimizer.step() | ||
| + | warmup_cosine(optimizer, | ||
| + | |||
| + | step += 1 / len(loader) | ||
| + | train_loss_list.append((step, | ||
| + | |||
| + | tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000: | ||
| + | |||
| + | # eval | ||
| + | model.eval() | ||
| + | tgt, src, gen = dataset.get_test_data(args.test_steps, | ||
| + | |||
| + | with torch.no_grad(): | ||
| + | for i in range(args.test_steps - args.block_size): | ||
| + | gen_, src = model(src) | ||
| + | gen = torch.cat([gen, | ||
| + | | ||
| + | loss = (0.5 * (tgt - gen) ** 2).mean() | ||
| + | score = (-loss).exp() | ||
| + | test_score_list.append((step, | ||
| + | |||
| + | mlab.plot(tgt.cpu().numpy()[0, | ||
| + | mlab.oplot(gen.cpu().numpy()[0, | ||
| + | tqdm.tqdm.write(plotille.scatter(*zip(*test_score_list[-1000: | ||
| + | tqdm.tqdm.write(str(args)) | ||
| + | |||
| + | embed() | ||
| + | |||
| + | </ | ||
| + | |||
| + | ===== V1 ===== | ||
| <code python gpt.py> | <code python gpt.py> | ||
| 줄 17: | 줄 319: | ||
| from gr.pygr import mlab | from gr.pygr import mlab | ||
| from IPython import embed | from IPython import embed | ||
| + | from traitlets.config.loader import ArgumentParser | ||
| + | |||
| + | |||
| + | def parse_args(): | ||
| + | parser = ArgumentParser() | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | parser.add_argument(' | ||
| + | return parser.parse_args() | ||
| + | |||
| + | args = parse_args() | ||
| 줄 22: | 줄 336: | ||
| def __init__(self, | def __init__(self, | ||
| super().__init__() | super().__init__() | ||
        self.scale = np.power(key_dim, 0.5)
        self.n_heads = num_heads
        self.dropout = nn.Dropout(drop)

    def forward(self, q, k, v):
        q = self.split_heads(q)
        k = self.split_heads(k, key=True)
        v = self.split_heads(v)
        w = torch.matmul(q, k)
        w = w / self.scale
        attn = F.softmax(w, dim=-1)
        attn = self.dropout(attn)
        context = torch.matmul(attn, v)
        context = self.merge_heads(context)
        return context, attn

    def split_heads(self, x, key=False):
        seq, bs, emb = x.size()
        d_k = emb // self.n_heads
        x = x.view(seq, bs, self.n_heads, d_k)
        if key:
            # bs, self.n_heads, d_k, seq
            x = x.permute(1, 2, 3, 0)
        else:
            # bs, self.n_heads, seq, d_k
            x = x.permute(1, 2, 0, 3)
        return x

    def merge_heads(self, x):
        bs, n_heads, seq, d_k = x.size()
        x = x.permute(2, 0, 1, 3)
        x = x.reshape(seq, bs, -1)
        return x


class MHA(nn.Module):
    def __init__(self, embed_dim, num_heads, drop=0.1):
        super().__init__()
        self.n_heads = num_heads
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        # the custom attention above was replaced by torch's built-in implementation
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=drop)
        self.out = nn.Linear(embed_dim, embed_dim)
        layers = (self.qkv, self.out)
        for layer in layers:
            torch.nn.init.xavier_uniform_(layer.weight)
        self.out.bias.data.zero_()

    def forward(self, x, src_mask=None):
        seq, bsz, emb = x.size()
        q, k, v = self.qkv(x).split(emb, dim=2)
        context, attn = self.attn(q, k, v, attn_mask=src_mask)
        return self.out(context)

# ... (the MLP class definition and its __init__ are unchanged and not shown)
    def forward(self, x):
| - | | + | |
| - | | + | x = F.gelu(x) |
| + | | ||
| + | return x | ||
| - | class Block(nn.Module): | + | class CustomBlock(nn.Module): |
| - | def __init__(self, | + | def __init__(self, |
| - | super(Block, self).__init__() | + | super().__init__() |
| self.ln_1 = nn.LayerNorm(embed_dim) | self.ln_1 = nn.LayerNorm(embed_dim) | ||
| - | self.attn = MHA(embed_dim, | + | self.attn = MHA(embed_dim, |
| self.ln_2 = nn.LayerNorm(embed_dim) | self.ln_2 = nn.LayerNorm(embed_dim) | ||
| self.mlp = MLP(embed_dim) | self.mlp = MLP(embed_dim) | ||
| - | def forward(self, | + | def forward(self, |
| - | x = x + self.attn(self.ln_1(x)) | + | x = x + self.attn(self.ln_1(x), src_mask) |
| x = x + self.mlp(self.ln_2(x)) | x = x + self.mlp(self.ln_2(x)) | ||
| return x | return x | ||
| + | |||
| + | |||
| + | class Block(nn.TransformerEncoderLayer): | ||
| + | def __init__(self, | ||
| + | super().__init__(d_model, | ||
| + | self.activation = F.gelu | ||
| + | |||
| + | def forward(self, | ||
| + | # MHA | ||
| + | x = self.norm1(src) | ||
| + | x = self.self_attn(x, | ||
| + | src = src + self.dropout1(x) | ||
| + | # MLP | ||
| + | x = self.linear2(self.dropout(self.activation(self.linear1(self.norm2(src))))) | ||
| + | src = src + self.dropout2(x) | ||
| + | return src | ||
| class GPTModel(nn.Module): | class GPTModel(nn.Module): | ||
    def __init__(self, input_dims, output_dims, max_len, custom_block=False):
        super().__init__()
        self.n_layers = 3
        self.n_heads = 16
        self.d_model = 512
        self.max_len = max_len

        self.we = nn.Linear(input_dims, self.d_model)
        self.wp = nn.Embedding(self.max_len, self.d_model)
        # the condition that selects between the two block types is cut off in
        # the dump; a keyword flag is assumed here
        if custom_block:
            self.blocks = nn.ModuleList([
                CustomBlock(self.d_model, self.n_heads)
                for _ in range(self.n_layers)
            ])
        else:
            self.blocks = nn.ModuleList([
                Block(self.d_model, self.n_heads)
                for _ in range(self.n_layers)
            ])
        self.norm = nn.LayerNorm(self.d_model)
        self.wd = nn.Linear(self.d_model, output_dims, bias=False)

        # init constants beyond the first argument are assumptions
        torch.nn.init.normal_(self.we.weight, std=0.02)
        torch.nn.init.uniform_(self.wp.weight, -0.01, 0.01)
        torch.nn.init.normal_(self.wd.weight, std=0.02)

    def forward(self, src):
        src_embed = self.we(src)
        pos_idx = torch.arange(src.size(0), device=src.device)
        pos_embed = self.wp(pos_idx).unsqueeze(1)
        hx = src_embed + pos_embed
        src_mask = self.generate_src_mask(src.size(0), src.device)
        for block in self.blocks:
            hx = block(hx, src_mask=src_mask)
        hx = self.norm(hx)
        out = self.wd(self.norm(hx))
        # slide the window one step forward, appending the newest prediction
        src = torch.cat([src[1:], out[-1:]], dim=0)
        return out, src

    @staticmethod
    def generate_src_mask(size, device):
        # standard additive causal mask: 0 where attention is allowed, -inf elsewhere
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().to(device)
        mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

# ... (start of the __main__ block is unchanged and not shown)
    n_epochs = 2500
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| device = ' | device = ' | ||
| - | dataset = np.sin(np.arange(1024) / 10.) | + | dataset = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5 |
| | | ||
| - | model = GPTModel(1, 1).to(device) | + | model = GPTModel(1, 1, prev_steps + next_steps).to(device) |
| - | optimizer = optim.Adam( | + | optimizer = optim.Adam(model.parameters(), |
| - | | + | |
| - | ) | + | |
| + | def warmup_cosine(optimizer, | ||
| + | s = float(epoch <= warmup) | ||
| + | w = s*(epoch / warmup) + (1-s)*(0.5 * (1 + np.cos(np.pi * epoch))) | ||
| + | for param_group in optimizer.param_groups: | ||
| + | param_group[' | ||
| step = 0 | step = 0 | ||
| - | | + | |
| + | test_loss_list | ||
| | | ||
| - | for _ in tqdm.trange(n_epochs): | + | for epoch in tqdm.trange(n_epochs): |
| - | bid = np.random.randint( | + | # make batch id |
| - | 0, len(dataset)-(prev_steps + next_steps), (len(dataset) // mb, mb) | + | bid = np.arange(len(dataset)-(prev_steps + next_steps)) |
| - | | + | np.random.shuffle(bid) |
| + | bid = bid[:len(bid) // bsz * bsz] | ||
| + | | ||
| pos = np.arange(prev_steps + next_steps).reshape(1, | pos = np.arange(prev_steps + next_steps).reshape(1, | ||
| - | idxes = bid + pos | + | idxes = bid + pos # mini-batch x seq x data-index |
| - | for idx in idxes: | + | |
| - | data = dataset[idx].reshape((prev_steps + next_steps, | + | |
| - | | + | data = dataset[idx].reshape((prev_steps + next_steps, |
| - | src, tgt = data[: | + | |
| - | gen = torch.empty(0, | + | |
| - | | + | dtype=torch.float32, |
| - | gen_, src = model(src) | + | ) |
| - | gen = torch.cat([gen, | + | gen, _ = model(tgt) |
| optimizer.zero_grad() | optimizer.zero_grad() | ||
| - | loss = (0.5 * (tgt - gen) ** 2).mean() | + | loss = (0.5 * (tgt[1:] - gen[:-1]) ** 2).mean() |
| loss.backward() | loss.backward() | ||
| optimizer.step() | optimizer.step() | ||
| + | # scheduler.step(epoch + i / len(idxes)) | ||
| + | warmup_cosine(optimizer, | ||
| step += 1 / len(idxes) | step += 1 / len(idxes) | ||
| - | | + | |
| - | | + | |
| - | | + | idx = np.random.randint(0, len(dataset)-(prev_steps + test_steps), |
| - | torch.cat([data[: | + | idx = idx + np.arange(prev_steps + test_steps).reshape(-1, |
| - | | + | |
| + | tgt = torch.tensor( | ||
| + | | ||
| + | | ||
| ) | ) | ||
| - | | + | |
| + | gen = tgt[: | ||
| - | embed() | + | with torch.no_grad(): |
| + | for _ in range(test_steps): | ||
| + | gen_, src = model(src) | ||
| + | gen = torch.cat([gen, | ||
| + | |||
| + | mlab.plot(data.reshape(-1)) | ||
| + | mlab.oplot(gen.squeeze_().cpu().numpy()) | ||
| + | loss = (0.5 * (data.reshape(-1) - gen.squeeze_().cpu().numpy()) ** 2).mean() | ||
| + | test_loss_list.append((step, | ||
| + | |||
| + | tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000: | ||
| + | tqdm.tqdm.write(plotille.scatter(*zip(*test_loss_list[-1000: | ||
| + | tqdm.tqdm.write(str(args)) | ||
| + | |||
| + | embed() | ||
| </ | </ | ||
{{tag>}}