
Commit efa0bc6

Added metrics script
1 parent ad4cf6f commit efa0bc6


2 files changed: +102 -1 lines changed

.gitignore: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 venv
 results
 __pycache__/
-metrics*
+metrics.ipynb
metrics script (new file): 101 additions & 0 deletions

@@ -0,0 +1,101 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
import os
import click
import logging
from rich.logging import RichHandler
from datasets import load_dataset

FORMAT = "%(message)s"
logging.basicConfig(
    level="NOTSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
)
log = logging.getLogger("rich")

torch.manual_seed(42)


@click.command()
@click.option(
    "--filename",
    default="netflix_titles_small.csv",
    help="CSV file name",
    show_default=True,
)
@click.option(
    "--column_name", default="description", help="CSV column name", show_default=True
)
@click.option(
    "--model_name",
    default="distilgpt2",
    help="Hugging Face model or path to the model",
    show_default=True,
)
@click.option(
    "--tokenizer_name",
    default="distilgpt2",
    help="Hugging Face tokenizer or path to the tokenizer",
    show_default=True,
)
@click.option(
    "--stride",
    default=512,
    help="Stride length for computing perplexity",
    show_default=True,
)
def metrics(filename, column_name, model_name, tokenizer_name, stride):
    if os.path.exists(filename):
        test = load_dataset("csv", data_files=filename)
    else:
        msg = "File %s doesn't exist" % filename
        raise ValueError(msg)

    cuda_available = torch.cuda.is_available()

    device = "cpu"
    if cuda_available:
        device = "cuda"
        model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    encodings = tokenizer("\n\n".join(test["train"][column_name]), return_tensors="pt")

    max_length = model.config.n_positions
    stride = min(stride, max_length)
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask tokens already scored in a previous window: -100 is the
        # ignore_index of CrossEntropyLoss, so only the last trg_len
        # tokens contribute to the loss.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss, which averages over
            # input tokens. Multiply it by trg_len to get the summation
            # instead of the average; we take the average over all tokens
            # to get the true average in the last step.
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    log.info("Perplexity = %0.3f (lower is better)" % ppl)


if __name__ == "__main__":
    metrics()
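For intuition: outputs.loss is the mean cross-entropy over a window's target tokens, so multiplying by trg_len recovers that window's summed negative log-likelihood. With mean loss l_w and t_w target tokens per window w, and N = sum_w t_w total tokens (equal to end_loc when the loop finishes), the last step computes

    PPL = exp( (1/N) * sum_w l_w * t_w )

i.e. the exponentiated average per-token negative log-likelihood. This sliding-window scheme closely follows the Hugging Face perplexity guide.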
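A sketch of how the script could be invoked, assuming it is saved as metrics.py (the new file's name is not shown in this diff, so that name is hypothetical) and the CSV file is present in the working directory:

    python metrics.py --filename netflix_titles_small.csv --column_name description --stride 256

Click builds the command-line interface from the @click.option decorators, so python metrics.py --help prints the same options and defaults.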
