-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
executable file
·136 lines (112 loc) · 3.97 KB
/
utils.py
File metadata and controls
executable file
·136 lines (112 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import json
import torch
import random
def to_gpu(gpu, var):
    """Move *var* onto the GPU when *gpu* is truthy; otherwise return it unchanged."""
    return var.cuda() if gpu else var
class Dictionary(object):
    """Word <-> index mapping built from a comma-separated vocabulary file.

    The first four indices are reserved for special symbols; real vocabulary
    words start at ``offset``.
    """

    # Reserved special-symbol indices.
    pad = 0
    sos = 1
    eos = 2
    oov = 3
    offset = 4

    def __init__(self, dict_file, vocab_size):
        """Load up to *vocab_size* words from *dict_file*.

        Each non-blank line starts with the word, optionally followed by a
        comma and extra fields (e.g. a frequency count).
        """
        words = []
        # NOTE(review): utf-8 assumed for the vocab file — confirm against
        # how the file is produced.
        with open(dict_file, encoding='utf-8') as f:
            for line in f:
                # Strip the trailing newline first: iterating a file yields
                # lines that still contain '\n', so the original
                # `line == ''` blank-line check never fired, and comma-less
                # lines kept '\n' inside the stored word.
                line = line.strip()
                if not line:
                    continue
                words.append(line.split(',')[0])
        self.word2idx = {}
        self.word2idx['<pad>'] = self.pad
        self.word2idx['<sos>'] = self.sos
        self.word2idx['<eos>'] = self.eos
        self.word2idx['<oov>'] = self.oov
        for idx, word in enumerate(words[:vocab_size]):
            self.word2idx[word] = idx + self.offset
        # Inverse mapping for index -> word lookups.
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def __len__(self):
        """Number of entries including the four special symbols."""
        return len(self.word2idx)

    def __getitem__(self, key):
        """Look up a word (str -> index; unknown words map to ``oov``) or an
        index (int -> word).  Any other key raises ``KeyError``."""
        if isinstance(key, str):
            # Unknown words deliberately fall back to <oov> rather than raising.
            return self.word2idx.get(key, self.oov)
        if isinstance(key, int) and key in self.idx2word:
            return self.idx2word[key]
        raise KeyError(key)
class Corpus(object):
    """A JSON corpus of chunks (each chunk is a list of token lines),
    indexed through a ``Dictionary``."""

    def __init__(self, data_file, dict_file, vocab_size=11000):
        """Build the dictionary from *dict_file* and load the raw JSON corpus."""
        self.dictionary = Dictionary(dict_file, vocab_size)
        with open(data_file) as f:
            self.raw = json.load(f)

    def get_data(self, split=None):
        """Encode every line as <sos> + token ids + <eos>.

        With a falsy *split*, return the flat list of encoded lines;
        otherwise return a (train, rest) pair where train takes the larger
        fraction, max(split, 1 - split), of the data.
        """
        sos = self.dictionary.sos
        eos = self.dictionary.eos
        data = []
        for chunk in self.raw:
            for line in chunk:
                data.append([sos] + [self.dictionary[w] for w in line] + [eos])
        if not split:
            return data
        train_size = int(len(data) * max(split, 1 - split))
        return data[:train_size], data[train_size:]

    def get_chunks(self, size):
        """Return every sliding window of *size* consecutive encoded lines,
        taken only from chunks strictly longer than *size* lines.

        Lines are encoded as <sos> + token ids — source has no end symbol.
        """
        windows = []
        for chunk in self.raw:
            if len(chunk) <= size:
                continue
            encoded = [[self.dictionary.sos] + [self.dictionary[w] for w in line]
                       for line in chunk]
            for start in range(len(encoded) - size + 1):
                windows.append(encoded[start:start + size])
        return windows
def batchify(data, bsz, shuffle=False):
    """Group *data* (token-id lists carrying both <sos> and <eos>) into
    (source, target, lengths) LongTensor batches of size *bsz*.

    Source drops the final symbol, target drops the first, and both are
    right-padded with ``Dictionary.pad`` to the longest sequence in the
    batch.  A trailing partial batch is discarded.
    NOTE(review): shuffle=True shuffles the caller's list in place.
    """
    if shuffle:
        random.shuffle(data)
    batches = []
    for start in range(0, (len(data) // bsz) * bsz, bsz):
        batch = data[start:start + bsz]
        # source has no end symbol
        source = [seq[:-1] for seq in batch]
        # target has no start symbol
        target = [seq[1:] for seq in batch]
        # find length to pad to, subtract 1 from lengths b/c includes BOTH starts & end symbols
        lengths = [len(seq) - 1 for seq in batch]
        pad_to = max(lengths)
        for src, tgt in zip(source, target):
            src.extend([Dictionary.pad] * (pad_to - len(src)))
            tgt.extend([Dictionary.pad] * (pad_to - len(tgt)))
        batches.append((torch.LongTensor(source),
                        torch.LongTensor(target),
                        # substract 1 for length embedding indexing
                        torch.LongTensor([n - 1 for n in lengths])))
    return batches
class BatchGen(object):
    """Endless iterator yielding random (source, target) chunk-pair batches."""

    def __init__(self, chunks, batch_size):
        super(BatchGen, self).__init__()
        self.chunks = chunks
        self.batch_size = batch_size

    def pad_data(self, data):
        """Right-pad each sequence with ``Dictionary.pad`` to the longest length."""
        widest = max(len(seq) for seq in data)
        return [seq + [Dictionary.pad] * (widest - len(seq)) for seq in data]

    def __iter__(self):
        return self

    def __next__(self):
        """Sample ``batch_size`` chunk pairs and return them as padded LongTensors."""
        sampled = random.sample(self.chunks, self.batch_size)
        sources, targets = zip(*sampled)
        source = torch.LongTensor(self.pad_data(list(sources)))
        target = torch.LongTensor(self.pad_data(list(targets)))
        return source, target