Commit 095abbdd authored by Dennis Müller

python files

parent 2c40ffd4
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from util import server
import logging

out_folder = "./l2s"
tokenizer = GPT2Tokenizer.from_pretrained(out_folder)
gpt = GPT2LMHeadModel.from_pretrained(out_folder)

logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
n_gpu = 0 if device.type == "cpu" else torch.cuda.device_count()
gpt = gpt.to(device)


def main():
    logger.info("Hello, it's me")
    server(tokenizer, gpt)


if __name__ == '__main__':
    main()
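# Illustration only: a minimal client sketch for talking to the server started above. It assumes
# the `server` imported from `util` uses the defaults defined alongside it (127.0.0.1:65432) and
# the wire format implemented there: a sentence and one or more math fragments joined by "<s>",
# answered with "<s>"-joined translations terminated by "\r\n". The example request string is
# made up; the server handles a single connection and shuts down after "close" or an empty message.
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    sock.connect(("127.0.0.1", 65432))
    sock.sendall("Let $G$ be a group.<s>G".encode("utf-8"))
    response = b""
    while not response.endswith(b"\r\n"):   # the server terminates each reply with "\r\n"
        chunk = sock.recv(4096)
        if not chunk:
            break
        response += chunk
    print(response.decode("utf-8").strip().split("<s>"))
    sock.sendall(b"close")                  # ends the server loop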
import os
from more_itertools import split_at
from transformers import PreTrainedTokenizer, GPT2Tokenizer
import torch
from transformers import GPT2LMHeadModel, PreTrainedModel
from transformers import AdamW, get_linear_schedule_with_warmup
from typing import Dict, List, Tuple
from torch.utils.tensorboard import SummaryWriter
import logging
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, RandomSampler

logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.DEBUG,
    filemode='w'
)
logger.addHandler(logging.FileHandler("l2s.log"))
n_gpu = 0 if device.type == "cpu" else torch.cuda.device_count()
# TRAIN ----------------------------------------------------------------------------------------------------------------
def train(
    train_dataset,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    out_folder,
    per_gpu_batch_size=2,
    gradient_accumulation_steps=32,
    weight_decay=0.0,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    warmup_steps=0,
    num_train_epochs=1,
    max_grad_norm=1.0,
    logging_steps=50,
    save_steps=50,
    save_total_limit=5
) -> Tuple[int, float]:
    """Train the model."""
    tb_writer = SummaryWriter(log_dir=out_folder + "/logs")
    model = model.to(device)
    train_batch_size = per_gpu_batch_size * max(1, n_gpu)

    from torch.nn.utils.rnn import pad_sequence

    def collate(examples: List[torch.Tensor]):
        # Pad a batch of variable-length id tensors to the longest sequence in the batch
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=train_batch_size, collate_fn=collate
    )
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        out_folder
        and os.path.isfile(os.path.join(out_folder, "optimizer.pt"))
        and os.path.isfile(os.path.join(out_folder, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(out_folder, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(out_folder, "scheduler.pt")))

    # Multi-GPU training
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", per_gpu_batch_size)
    logger.info(
        " Total train batch size (w. parallel & accumulation) = %d",
        train_batch_size * gradient_accumulation_steps
    )
    logger.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if out_folder and os.path.exists(out_folder):
        try:
            # Set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = out_folder.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // gradient_accumulation_steps)
            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=False
    )

    import random
    import numpy as np

    def set_seed():
        # Fix all RNG seeds for reproducibility
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(42)
    def _sorted_checkpoints(checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
        # List existing checkpoint folders, sorted by step number (or by mtime)
        ordering_and_checkpoint_path = []
        import glob
        import re
        glob_checkpoints = glob.glob(os.path.join(out_folder, "{}-*".format(checkpoint_prefix)))
        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
            else:
                regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
        return checkpoints_sorted

    def _rotate_checkpoints(checkpoint_prefix="checkpoint", use_mtime=False) -> None:
        import shutil
        if not save_total_limit:
            return
        if save_total_limit <= 0:
            return
        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = _sorted_checkpoints(checkpoint_prefix, use_mtime)
        if len(checkpoints_sorted) <= save_total_limit:
            return
        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info("Deleting older checkpoint [{}] due to save_total_limit".format(checkpoint))
            shutil.rmtree(checkpoint)
    set_seed()  # For reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            # Causal language modelling: the input sequence serves as its own label
            inputs, labels = (batch, batch)
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuples in transformers (see doc)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if logging_steps > 0 and global_step % logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_steps, global_step)
                    logging_loss = tr_loss
                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(out_folder, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    # Sanity check: reload the checkpoint and make sure the embedding weights are not NaN
                    import math
                    failed = math.isnan(GPT2LMHeadModel.from_pretrained(output_dir).transformer.wte.weight[0][1].item())
                    if failed:
                        logger.warning("Saved checkpoint contains NaNs!")
                        raise ValueError("Saved checkpoint contains NaNs")
                    else:
                        logger.info("Saving successful")
                    _rotate_checkpoints(checkpoint_prefix)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
    tb_writer.close()

    # Final save of the fine-tuned model and tokenizer to out_folder
    os.makedirs(out_folder, exist_ok=True)
    model_to_save = (
        model.module if hasattr(model, "module") else model
    )  # Take care of distributed/parallel training
    model_to_save.save_pretrained(out_folder)
    tokenizer.save_pretrained(out_folder)
    # Sanity check the final save as well
    import math
    failed = math.isnan(GPT2LMHeadModel.from_pretrained(out_folder).transformer.wte.weight[0][1].item())
    if failed:
        logger.warning("Saved model contains NaNs!")
        raise ValueError("Saved model contains NaNs")
    else:
        logger.info("Saving successful")
    return global_step, tr_loss / global_step
# ----------------------------------------------------------------------------------------------------------------------
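# Expected corpus format (as parsed by _getIterator below): documents are separated by lines
# consisting of "<doc>"; within a document, the natural-language sentence comes first and is
# terminated by a line consisting of "</s>"; the remainder is a sequence of blank-line-separated
# blocks that alternate between a LaTeX fragment and its sTeX counterpart (the `stex` field).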
class QueryDataset(Dataset):
    @staticmethod
    def replace(s: str):
        # Normalize math delimiters: "$$...$$" and "\[...\]" both become "$...$"
        return s.replace("$$", "$").replace("\\[", "$").replace("\\]", "$")

    def pad(self, t, size):
        # Right-pad a 1-D tensor of token ids with 1s up to `size`
        nt = torch.ones(size, dtype=torch.long)
        nt[0:t.size(0)] = t
        nt[t.size(0):] = 1
        return nt

    def __init__(self, tokenizer, file, block_size: int = 1024, istest: bool = False):
        self.tokenizer = tokenizer
        self.list = list(self._getIterator(file, block_size, istest))

    def _getIterator(self, file, block_size, istest):
        for d in split_at(open(file), lambda x: (x == '<doc>\n')):
            als = list(split_at(d, lambda x: (x == '</s>\n')))
            sentence = self.replace(''.join(als[0]).strip())
            sentence_ids = self.tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=100000)["input_ids"]
            ls = [''.join(p).strip() for p in list(split_at(als[1], lambda x: (x == '\n')))]
            latexs = ls[0:][::2]
            stexs = ls[1:][::2]
            for i in range(len(latexs)):
                latex_ids = self.tokenizer.encode_plus(self.replace(latexs[i]), add_special_tokens=True, max_length=100000)["input_ids"]
                stex_ids = self.tokenizer.encode_plus(self.replace(stexs[i]), add_special_tokens=True, max_length=100000)["input_ids"]
                if istest:
                    yield {
                        "ids": torch.tensor(
                            sentence_ids + [0] + latex_ids + [0],
                            dtype=torch.long
                        ),
                        "sentence": sentence,
                        "latex": self.replace(latexs[i]),
                        "stex": self.replace(stexs[i]),
                    }
                else:
                    # Training example: sentence [0] latex [0] stex [0, 1]; examples that do not
                    # fit into block_size are dropped, the rest are padded to block_size
                    ret = torch.tensor(
                        sentence_ids + [0] + latex_ids + [0] + stex_ids + [0, 1],
                        dtype=torch.long
                    )
                    if ret.size(0) <= block_size:
                        yield self.pad(ret, block_size)

    def __len__(self):
        return len(self.list)

    def __getitem__(self, index):
        return self.list[index]
# train(dataset,gpt,tokenizer, per_gpu_batch_size=1, gradient_accumulation_steps=64,num_train_epochs=3)
import socket
HOST = '127.0.0.1'
PORT = 65432
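# Wire protocol (as implemented below): the client sends a UTF-8 request of the form
# "<sentence> <s> <math_1> <s> <math_2> ...", i.e. a natural-language sentence followed by one or
# more LaTeX fragments separated by "<s>". For each fragment the model generates a translation;
# the results are joined with "<s>" and sent back terminated by "\r\n". Sending "close" or an
# empty message shuts the connection down.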
def server(tokenizer, gpt):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((HOST, PORT))
        logger.info("Server running")
        s.listen()
        conn, addr = s.accept()
        with conn:
            logger.info("Connected by " + str(addr))
            while True:
                logger.info("Waiting for ping...")
                data = conn.recv(4096)
                logger.info("Ping: " + str(data))
                data = data.decode("utf-8")
                if data == "close" or data == "":
                    logger.info("Closing")
                    break
                returns = []
                data = data.strip()
                if not data == "":
                    logger.info("Received: " + data)
                    seq = data.split("<s>")
                    sentence = seq[0].strip()
                    logger.info("Sentence: " + sentence)
                    sentence_ids = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=100000)["input_ids"]
                    for fragment in seq[1:]:
                        math = fragment.strip()
                        logger.info("Math: " + math)
                        latex_ids = tokenizer.encode_plus(QueryDataset.replace(math), add_special_tokens=True, max_length=100000)["input_ids"]
                        input = torch.tensor(
                            sentence_ids + [0] + latex_ids + [0],
                            dtype=torch.long
                        )
                        ret = gpt.generate(
                            input_ids=input.unsqueeze(0).to(device),
                            max_length=500)[0]
                        ret = tokenizer.decode(ret[input.size(0):-2]).replace("\n", "")
                        logger.info("Result: " + ret)
                        returns += [ret]
                returns = "<s>".join(returns) + "\r\n"
                logger.info("Returning: " + returns)
                conn.sendall(returns.encode("utf-8"))
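# Illustration only: a minimal fine-tuning sketch mirroring the commented-out train(...) call
# above. "./l2s" (a folder containing a GPT-2 checkpoint and tokenizer) and "corpus.txt" (a file
# in the format described above QueryDataset) are placeholders, not paths from this commit.
if __name__ == "__main__":
    out_folder = "./l2s"
    tokenizer = GPT2Tokenizer.from_pretrained(out_folder)
    gpt = GPT2LMHeadModel.from_pretrained(out_folder)
    dataset = QueryDataset(tokenizer, "corpus.txt", block_size=1024)
    train(dataset, gpt, tokenizer, out_folder,
          per_gpu_batch_size=1, gradient_accumulation_steps=64, num_train_epochs=3)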