====== GPT Example ======
  * References
    * https://
    * https://
    * https://
    * https://
===== V2 =====
<code python gpt_v2.py>

import argparse
import math

import numpy as np
import plotille
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from gr.pygr import mlab
from IPython import embed
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader


def parse_args():
    # NOTE: the argument names are inferred from how args.* is used below;
    # the default values are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--dataset', type=str, default='BasicDataset')
    parser.add_argument('--block_size', type=int, default=64)
    parser.add_argument('--noise_scale', type=float, default=0.05)
    parser.add_argument('--repeat', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight_decay', type=float, default=0.1)
    parser.add_argument('--warmup', type=int, default=5)
    parser.add_argument('--grad_clip', type=float, default=1.0)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--test_steps', type=int, default=512)
    return parser.parse_args()


args = parse_args()


class CausalSelfAttention(nn.Module):
    """
    A causal (masked) multi-head self-attention layer with an output projection,
    in the style of Karpathy's minGPT.
    https://
    """

    def __init__(self, d_model, n_head, dropout, block_size):
        super().__init__()
        assert d_model % n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(d_model, d_model)
        self.query = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        # regularization
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        # output projection
        self.proj = nn.Linear(d_model, d_model)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
        )
        self.n_head = n_head

    def forward(self, x):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)    # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    """ A Transformer block: causal self-attention followed by a position-wise MLP. """

    def __init__(self, d_model, n_head, dropout, block_size):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_head, dropout, block_size)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # pre-norm residual connections
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class GPTModel(nn.Module):
    def __init__(self, input_dims, output_dims, block_size):
        super().__init__()
        self.n_layers = 6
        self.n_heads = 8
        self.d_model = 512
        self.block_size = block_size

        # linear input projection (continuous features, not token embeddings) + learned positional embedding
        self.we = nn.Linear(input_dims, self.d_model)
        self.wp = nn.Parameter(torch.zeros(1, block_size, self.d_model))
        self.blocks = nn.Sequential(*[
            Block(self.d_model, self.n_heads, args.dropout, block_size)  # dropout value is an assumption
            for _ in range(self.n_layers)
        ])
        self.norm = nn.LayerNorm(self.d_model)
        self.wd = nn.Linear(self.d_model, output_dims)

        self.apply(self._init_weights)
        print(f'number of parameters: {sum(p.numel() for p in self.parameters())}')

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, src):
        B, T, C = src.size()
        src_embed = self.we(src)
        pos_embed = self.wp[:, :T, :]
        hx = src_embed + pos_embed
        hx = self.blocks(hx)
        hx = self.norm(hx)
        out = self.wd(hx)
        # shift the input window one step and append the newest prediction,
        # so the returned `src` can be fed straight back in for autoregressive rollout
        src = torch.cat([src[:, 1:, :], out[:, -1:, :]], dim=1)
        return out, src


class BasicDataset(Dataset):

    def __init__(self, block_size, noise_scale, repeat):
        self.block_size = block_size

        self.data = np.sin(np.arange(10240) / 10.)
        # self.data = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5
        # self.data = np.abs(np.sin(np.arange(10240) / 10.))
        # data = np.sin(np.arange(10240) / 10.) * (np.sin(np.arange(10240) / 10.) > 0.0)
        self.data = self.data.astype(np.float32)
        self.data = self.data.reshape(-1, 1)
        self.data_std = self.data.std(0)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # "cheat" and pick a random spot in the series, then add a little noise for augmentation
        i = np.random.randint(0, len(self.data) - self.block_size - 1)
        chunk = self.data[i:i + self.block_size + 1].copy()
        chunk += np.random.normal(0., self.noise_scale, chunk.shape).astype(np.float32) * self.data_std
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        # one long target sequence; the generator is seeded with the first block_size steps
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size]
        gen = tgt[:, :self.block_size]
        return tgt, src, gen


class MotionDataset(Dataset):

    def __init__(self, block_size, noise_scale, repeat):
        self.block_size = block_size

        import urllib, json
        # NOTE: the URL, JSON key, column selection and tiling factor are cut off
        # in the source; the lines below keep placeholders for the missing pieces.
        url = "..."
        self.data = json.loads(urllib.request.urlopen(url).read())['...']
        self.data = np.array(self.data, dtype=np.float32)
        self.data = np.hstack([self.data[:, ...]])
        self.data = np.tile(self.data, ...)
        self.dims = self.data.shape[-1]
        self.data_mean = self.data.mean(0, keepdims=True)
        self.data_std = self.data.std(0, keepdims=True)
        self.data = (self.data - self.data_mean) / self.data_std

        self.data = self.data.astype(np.float32)
        self.repeat = repeat
        self.noise_scale = noise_scale

    def __len__(self):
        # return math.ceil(len(self.data) / (self.block_size + 1))
        return len(self.data) * self.repeat

    def __getitem__(self, idx):
        # pick a random chunk of block_size + 1 frames and add noise for augmentation
        i = np.random.randint(0, len(self.data) - self.block_size - 1)
        chunk = self.data[i:i + self.block_size + 1].copy()
        chunk += np.random.normal(0., self.noise_scale, chunk.shape).astype(np.float32) * self.data_std
        x = torch.tensor(chunk[:-1], dtype=torch.float32)
        y = torch.tensor(chunk[1:], dtype=torch.float32)
        return x, y

    def get_test_data(self, test_steps, device):
        i = np.random.randint(0, len(self.data) - test_steps)
        idx = np.arange(i, i + test_steps)
        data = self.data[idx].reshape(1, test_steps, -1)
        tgt = torch.tensor(data, dtype=torch.float32, device=device)
        src = tgt[:, :self.block_size]
        gen = tgt[:, :self.block_size]
        return tgt, src, gen


if __name__ == '__main__':

    # create the dataloader
    Dataset = globals()[args.dataset]
    dataset = Dataset(args.block_size, args.noise_scale, args.repeat)
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, pin_memory=True)

    # create the model
    dim = dataset.data.shape[-1]
    model = GPTModel(dim, dim, args.block_size).to(args.device)

    # create the optimizer: no weight decay for biases, LayerNorms and the positional embedding
    # (the exact no_decay name list is an assumption)
    no_decay = ["bias", "ln", "norm", "wp"]
    params_decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    params_nodecay = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    optim_groups = [
        {"params": params_decay, "weight_decay": args.weight_decay},
        {"params": params_nodecay, "weight_decay": 0.0},
    ]
    optimizer = optim.AdamW(optim_groups, lr=args.lr)

    def warmup_cosine(optimizer, lr, epoch, warmup):
        # linear warmup for the first `warmup` epochs, then a cosine factor (argument names inferred)
        s = float(epoch <= warmup)
        w = s * (epoch / warmup) + (1 - s) * (0.5 * (1 + np.cos(np.pi * epoch)))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr * w

    step = 0
    train_loss_list = list()
    test_score_list = list()

    for epoch in tqdm.trange(args.max_epoch):
        # fitting
        model.train()
        for i, (src, tgt) in tqdm.tqdm(enumerate(loader), total=len(loader)):
            src, tgt = src.to(args.device), tgt.to(args.device)

            gen, _ = model(src)

            optimizer.zero_grad()
            loss = (0.5 * (tgt - gen) ** 2).mean()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            optimizer.step()
            warmup_cosine(optimizer, args.lr, epoch + i / len(loader), args.warmup)

            step += 1 / len(loader)
            train_loss_list.append((step, loss.item()))

        tqdm.tqdm.write(plotille.scatter(*zip(*train_loss_list[-1000:])))

        # eval
        model.eval()
        tgt, src, gen = dataset.get_test_data(args.test_steps, args.device)

        with torch.no_grad():
            for i in range(args.test_steps - args.block_size):
                gen_, src = model(src)
                gen = torch.cat([gen, src[:, -1:]], dim=1)

        loss = (0.5 * (tgt - gen) ** 2).mean()
        score = (-loss).exp()
        test_score_list.append((step, score.item()))

        mlab.plot(tgt.cpu().numpy()[0, :, 0])
        mlab.oplot(gen.cpu().numpy()[0, :, 0])
        tqdm.tqdm.write(plotille.scatter(*zip(*test_score_list[-1000:])))
        tqdm.tqdm.write(str(args))

    embed()

</code>
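
A minimal sanity check for ''GPTModel'', assuming the class definition above: it runs a single forward pass on a dummy batch and shows that ''forward'' returns both the per-step predictions and the input window shifted by one step with the newest prediction appended, which is exactly what the evaluation loop feeds back in.

<code python>
# Sketch only: assumes GPTModel from gpt_v2.py above is defined/importable.
import torch

block_size = 64
model = GPTModel(1, 1, block_size)   # 1-dim input, 1-dim output
model.eval()

src = torch.zeros(2, block_size, 1)  # dummy batch: (batch, time, features)
with torch.no_grad():
    out, next_src = model(src)

print(out.shape)       # (2, 64, 1): one prediction per input step
print(next_src.shape)  # (2, 64, 1): window shifted by one, newest prediction appended
</code>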

===== V1 =====
<code python gpt.py>
# ... (unchanged parts of gpt.py omitted)
import plotille
import torch
import torch.nn as nn
import torch.nn.functional as F
# ...
    parser.add_argument('
    parser.add_argument('
    parser.add_argument('
    parser.add_argument('
    return parser.parse_args()
# ...
        if args.custom_block:
            self.blocks = nn.ModuleList([
                CustomBlock(self.d_model,
            ])
        else:
            self.blocks = nn.ModuleList([
                Block(self.d_model,
            ])
# ...
n_epochs = 2500
prev_steps = 32
next_steps = 2
test_steps = 512
bsz = 32  # 8 # 4 # 128
device = '
dataset = np.sin(np.arange(10240) / 10.) * 0.5 + 2.5

model = GPTModel(1, 1, prev_steps + next_steps).to(device)
optimizer = optim.Adam(model.parameters(),

def warmup_cosine(optimizer, lr, epoch, warmup):
    # linear warmup then cosine factor (argument names inferred from the body)
    s = float(epoch <= warmup)
    w = s * (epoch / warmup) + (1 - s) * (0.5 * (1 + np.cos(np.pi * epoch)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * w

step = 0
# ...
        loss.backward()
        optimizer.step()
        warmup_cosine(optimizer,
        step += 1 / len(idxes)
# ...
embed()
</code>
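
The main change in V1 is replacing ''CosineAnnealingWarmRestarts'' with the hand-rolled ''warmup_cosine'' schedule that V2 also uses. Assuming the reconstructed signature ''warmup_cosine(optimizer, lr, epoch, warmup)'', the sketch below prints the multiplier the schedule applies to the base learning rate: a linear ramp over the first ''warmup'' epochs, then a cosine factor driven directly by the fractional epoch counter.

<code python>
# Sketch only: reproduces the multiplier computed inside warmup_cosine (signature assumed).
import numpy as np

def lr_multiplier(epoch, warmup=5):
    s = float(epoch <= warmup)
    return s * (epoch / warmup) + (1 - s) * (0.5 * (1 + np.cos(np.pi * epoch)))

for epoch in np.arange(0, 10, 0.5):
    print(f'epoch={epoch:4.1f}  lr_mult={lr_multiplier(epoch):.3f}')
</code>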
{{tag>