====== GPT Example ======
  * References
    * https://
    * https://
    * https://
    * https://
===== V2 =====

A single-file example that trains a small GPT (6 layers, 8 heads, d_model 512) to autoregressively predict continuous 1-D sequences (a sine wave, or motion-capture data), then evaluates it by rolling the model forward one step at a time from a seed window.

<code python gpt_v2.py>
import argparse
import math

import numpy as np
import plotille
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from gr.pygr import mlab
from IPython import embed
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader


def parse_args():
    # NOTE: the argument list was truncated in the original page; the names
    # and defaults below are reconstructed from how `args` is used later.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='BasicDataset')
    parser.add_argument('--block_size', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--lr', type=float, default=6e-4)
    parser.add_argument('--weight_decay', type=float, default=0.1)
    parser.add_argument('--warmup', type=int, default=10)
    parser.add_argument('--grad_clip', type=float, default=1.0)
    parser.add_argument('--test_steps', type=int, default=512)
    parser.add_argument('--noise_scale', type=float, default=0.01)
    parser.add_argument('--repeat', type=int, default=1)
    parser.add_argument('--device', type=str, default='cuda')
    return parser.parse_args()


args = parse_args()


class CausalSelfAttention(nn.Module):
    """
    Causal (masked) multi-head self-attention.
    https://
    """

    def __init__(self, d_model, n_head, block_size, dropout=0.1):
        super().__init__()
        assert d_model % n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(d_model, d_model)
        self.query = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        # regularization
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        # output projection
        self.proj = nn.Linear(d_model, d_model)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size),
        )
        self.n_head = n_head

    def forward(self, x):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)    # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y


class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, d_model, n_head, block_size, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_head, block_size, dropout)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # pre-LayerNorm residual connections
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class GPTModel(nn.Module):
    def __init__(self, input_dims, block_size, dropout=0.1):
        super().__init__()
        self.n_layers = 6
        self.n_heads = 8
        self.d_model = 512
        self.block_size = block_size

        self.we = nn.Linear(input_dims, self.d_model)  # input embedding
        self.wp = nn.Parameter(torch.zeros(1, block_size, self.d_model))  # learned positional embedding
        self.blocks = nn.Sequential(*[
            Block(self.d_model, self.n_heads, block_size, dropout)
            for _ in range(self.n_layers)
        ])
        self.norm = nn.LayerNorm(self.d_model)
        self.wd = nn.Linear(self.d_model, input_dims)  # project back to the input dimension

        self.apply(self._init_weights)
        print(f'# of params: {sum(p.numel() for p in self.parameters())}')

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, src):
        B, T, C = src.size()
        src_embed = self.we(src)
        pos_embed = self.wp[:, :T, :]
        hx = src_embed + pos_embed
        hx = self.blocks(hx)
        hx = self.norm(hx)
        out = self.wd(hx)
        # shift the context window one step forward, appending the newest prediction
        src = torch.cat([src[:, 1:, :], out[:, -1:, :]], dim=1)
        return out, src


class BasicDataset(Dataset):

    def __init__(self, block_size, noise_scale=0.0, repeat=1):
        self.block_size = block_size

        self.data = np.sin(np.arange(10240) / 10.)
        # self.data = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5
        # self.data = np.abs(np.sin(np.arange(10240) / 10.))
        # data = np.sin(np.arange(10240) / 10.) * (np.sin(np.arange(10240) / 10.) > 0.0)
        self.data = self.data.astype(np.float32)
        self.data = self.data.reshape(-1, 1)
        self.data_std = self.data.std(0)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # we're actually going to "cheat": ignore idx and sample a random window
        i = np.random.randint(0, len(self.data) - self.block_size - 1)
        chunk = self.data[i:i + self.block_size + 1].copy()
        chunk += np.random.normal(0., self.noise_scale * self.data_std, chunk.shape)
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size, :]
        gen = tgt[:, :self.block_size, :]
        return tgt, src, gen


class MotionDataset(Dataset):

    def __init__(self, block_size, noise_scale=0.0, repeat=1):
        self.block_size = block_size

        import urllib.request, json
        # NOTE: the dataset URL, JSON key, column selection, and tile factor
        # were truncated in the original page; placeholders are used below.
        url = "..."
        self.data = json.loads(urllib.request.urlopen(url).read())["..."]
        self.data = np.array(self.data, dtype=np.float32)
        self.data = np.hstack([self.data[:, :1], self.data[:, 1:]])
        self.data = np.tile(self.data, (10, 1))
        self.dims = self.data.shape[-1]
        self.data_mean = self.data.mean(0, keepdims=True)
        self.data_std = self.data.std(0, keepdims=True)
        self.data = (self.data - self.data_mean) / self.data_std

        self.data = self.data.astype(np.float32)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # as above: ignore idx and sample a random window
        i = np.random.randint(0, len(self.data) - self.block_size - 1)
        chunk = self.data[i:i + self.block_size + 1].copy()
        chunk += np.random.normal(0., self.noise_scale * self.data_std, chunk.shape)
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size, :]
        gen = tgt[:, :self.block_size, :]
        return tgt, src, gen


if __name__ == '__main__':

    # create the dataloader
    Dataset = globals()[args.dataset]
    dataset = Dataset(args.block_size, args.noise_scale, args.repeat)
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    # create the model
    dim = dataset.data.shape[-1]
    model = GPTModel(dim, args.block_size).to(args.device)

    # create the optimizer
    # (parameter-name fragments exempt from weight decay; list reconstructed)
    no_decay = ["bias", "ln", "norm", "wp"]
    params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    optim_groups = [
        {"params": params_decay, "weight_decay": args.weight_decay},
        {"params": params_nodecay, "weight_decay": 0.0},
    ]
    optimizer = optim.AdamW(optim_groups, lr=args.lr)

    def warmup_cosine(optimizer, epoch, warmup, max_epoch, base_lr):
        # linear warmup for `warmup` epochs, then cosine decay toward zero
        s = float(epoch <= warmup)
        w = s * (epoch / warmup) + (1 - s) * (0.5 * (1 + np.cos(np.pi * epoch / max_epoch)))
        for param_group in optimizer.param_groups:
            param_group['lr'] = w * base_lr

    step = 0
    train_loss_list = list()
    test_score_list = list()

    for epoch in tqdm.trange(args.max_epoch):
        # fitting
        model.train()
        for i, (src, tgt) in tqdm.tqdm(enumerate(loader), total=len(loader), leave=False):
            src, tgt = src.to(args.device), tgt.to(args.device)

            gen, _ = model(src)

            optimizer.zero_grad()
            loss = (0.5 * (tgt - gen) ** 2).mean()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            optimizer.step()
            warmup_cosine(optimizer, step, args.warmup, args.max_epoch, args.lr)

            step += 1 / len(loader)
            train_loss_list.append((step, loss.item()))

        tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000:]), height=15))

        # eval: roll the model forward one step at a time from the seed window
        model.eval()
        tgt, src, gen = dataset.get_test_data(args.test_steps, args.device)

        with torch.no_grad():
            for i in range(args.test_steps - args.block_size):
                gen_, src = model(src)
                gen = torch.cat([gen, gen_[:, -1:, :]], dim=1)

        loss = (0.5 * (tgt - gen) ** 2).mean()
        score = (-loss).exp()
        test_score_list.append((step, score.item()))

        mlab.plot(tgt.cpu().numpy()[0, :, 0])
        mlab.oplot(gen.cpu().numpy()[0, :, 0])
        tqdm.tqdm.write(plotille.scatter(*zip(*test_score_list[-1000:]), height=15))
        tqdm.tqdm.write(str(args))

    embed()

</code>

===== V1 =====
<code python gpt.py>