L1-Refinement/utils.py at main · Pzoom522/L1-Refinement

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

#

# This source code is licensed under the license found in the

# LICENSE file in the root directory of this source tree.

#

import os

import io

import re

import sys

import pickle

import random

import inspect

import argparse

import subprocess

import numpy as np

import torch

from torch import optim

from dictionary import Dictionary

# 'dumped' directory located one level above the directory containing this file
MAIN_DUMP_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
    'dumped',
)

# Faiss is an optional dependency: when it can be imported it dramatically
# accelerates the nearest neighbor search, and FAISS_AVAILABLE records whether
# the import succeeded.
try:
    import faiss
    FAISS_AVAILABLE = True
    # warn when the GPU resources class is not exposed by the installed Faiss
    if not hasattr(faiss, 'StandardGpuResources'):
        sys.stderr.write(
            "Impossible to import Faiss-GPU. "
            "Switching to FAISS-CPU, "
            "this will be slower.\n\n"
        )
except ImportError:
    sys.stderr.write(
        "Impossible to import Faiss library!! "
        "Switching to standard nearest neighbors search implementation, "
        "this will be significantly slower.\n\n"
    )
    FAISS_AVAILABLE = False

def load_fasttext_model(path):
    """
    Load a binarized fastText model from `path`.

    The `fastText` Python package is imported lazily, so it is only required
    when this function is actually called. A generic `Exception` with
    installation instructions is raised when the package cannot be imported.
    """
    try:
        import fastText as fasttext_module
    except ImportError:
        raise Exception("Unable to import fastText. Please install fastText for Python: "
                        "https://github.com/facebookresearch/fastText")
    model = fasttext_module.load_model(path)
    return model

def bow(sentences, word_vec, normalize=False):
    """
    Get sentence representations using average bag-of-words.

    - `sentences` is an iterable of token sequences.
    - `word_vec` maps a token to its vector; unknown tokens are ignored.
    - `normalize == True` divides every word vector by its L2 norm before
      averaging.

    Returns one row per sentence, stacked with `np.vstack`. A sentence without
    any known token is represented by the first vector stored in `word_vec`
    (this fallback vector is used as is, it is not normalized).
    """
    rows = []
    for sentence in sentences:
        vectors = [word_vec[token] for token in sentence if token in word_vec]
        if normalize:
            vectors = [vec / np.linalg.norm(vec) for vec in vectors]
        if not vectors:
            # no known token in this sentence: fall back to an arbitrary vector
            vectors = [word_vec[list(word_vec.keys())[0]]]
        rows.append(np.mean(vectors, axis=0))
    return np.vstack(rows)

def bow_idf(sentences, word_vec, idf_dict=None):
    """
    Get sentence representations using weighted IDF bag-of-words.

    - `sentences` is an iterable of token sequences; each distinct token of a
      sentence is counted once.
    - `word_vec` maps a token to its vector.
    - `idf_dict` maps a token to its IDF weight. Tokens missing from either
      `word_vec` or `idf_dict` are ignored. When `idf_dict` is None (the
      default), every known token gets a weight of 1, i.e. the result is the
      plain average over the distinct known tokens.

    Returns one row per sentence, stacked with `np.vstack`. A sentence without
    any usable token is represented by the first vector stored in `word_vec`.
    """
    embeddings = []
    for sent in sentences:
        unique_words = set(sent)
        if idf_dict is None:
            # the default argument used to crash on `w in idf_dict`;
            # fall back to uniform weights instead
            list_words = [w for w in unique_words if w in word_vec]
            weights = [1.0] * len(list_words)
        else:
            list_words = [w for w in unique_words if w in word_vec and w in idf_dict]
            weights = [idf_dict[w] for w in list_words]

        if list_words:
            weighted = [word_vec[w] * weight for w, weight in zip(list_words, weights)]
            # normalize by the total weight so that the weights sum to one
            sentvec = np.asarray(weighted) / np.sum(weights)
        else:
            # no usable token in this sentence: fall back to an arbitrary vector
            sentvec = [word_vec[list(word_vec.keys())[0]]]
        embeddings.append(np.sum(sentvec, axis=0))
    return np.vstack(embeddings)

def get_idf(europarl, src_lg, tgt_lg, n_idf):
    """
    Compute IDF values.

    `europarl[lg]` is indexed as a sequence of tokenized sentences. Each
    language uses its own window of `n_idf` sentences, starting at offset
    200000 for the first language and 200000 + `n_idf` for the second one.

    For every word seen in the window, the IDF is
    log10(number of sentences in the window / number of sentences containing
    the word), with a lower bound of 1.

    Returns a dictionary {language: {word: idf}}.
    """
    idf = {src_lg: {}, tgt_lg: {}}
    for k, lg in enumerate(idf):
        window_start = 200000 + k * n_idf
        window_end = 200000 + (k + 1) * n_idf
        documents = europarl[lg][window_start:window_end]
        doc_freq = idf[lg]

        # document frequency: each sentence counts a word at most once
        for sent in documents:
            for word in set(sent):
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # turn the document frequencies into IDF values, in place
        n_doc = len(documents)
        for word in doc_freq:
            doc_freq[word] = max(1, np.log10(n_doc / (doc_freq[word])))
    return idf

def get_nn_avg_dist(emb, query, knn):
    """
    Compute, for every row of `query`, the average score of its `knn` nearest
    neighbors among the rows of `emb`.

    The score is the inner product (Faiss `IndexFlatIP`, or a matrix product in
    the fallback path), so larger means closer.
    Use Faiss if available, otherwise a batched PyTorch implementation.
    Returns a NumPy array with one value per query.
    """
    if not FAISS_AVAILABLE:
        # pure PyTorch fallback: process the queries by batches
        batch_size = 1024
        emb_t = emb.transpose(0, 1).contiguous()
        batch_means = []
        for start in range(0, query.shape[0], batch_size):
            scores = query[start:start + batch_size].mm(emb_t)
            top_scores, _ = scores.topk(knn, dim=1, largest=True, sorted=True)
            batch_means.append(top_scores.mean(1).cpu())
        return torch.cat(batch_means).numpy()

    # Faiss works on NumPy arrays
    emb = emb.cpu().numpy()
    query = query.cpu().numpy()
    dim = emb.shape[1]
    if hasattr(faiss, 'StandardGpuResources'):
        # gpu mode (always on device 0)
        resources = faiss.StandardGpuResources()
        gpu_config = faiss.GpuIndexFlatConfig()
        gpu_config.device = 0
        index = faiss.GpuIndexFlatIP(resources, dim, gpu_config)
    else:
        # cpu mode
        index = faiss.IndexFlatIP(dim)
    index.add(emb)
    neighbor_scores, _ = index.search(query, knn)
    return neighbor_scores.mean(1)

def bool_flag(s):
    """
    Parse boolean arguments from the command line.

    The comparison is case-insensitive: 'off', 'false' and '0' give False,
    'on', 'true' and '1' give True. Any other value raises
    `argparse.ArgumentTypeError`.
    """
    value = s.lower()
    if value in ('off', 'false', '0'):
        return False
    if value in ('on', 'true', '1'):
        return True
    raise argparse.ArgumentTypeError("invalid value for a boolean flag (0 or 1)")

def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"

    The part before the first comma is the optimizer name; every following
    "name=value" pair is converted to a float (plain decimal or scientific
    notation, e.g. "lr=1e-3").

    Returns a tuple (optimizer class, dictionary of parameters).
    Raises `Exception` for an unknown optimizer or for parameters the
    optimizer does not accept, and `AssertionError` for malformed pairs or
    when "sgd" is given without "lr".
    """
    optim_params = {}
    if "," in s:
        method = s[:s.find(',')]
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            # raw string: "\d" in a regular string literal is an invalid escape
            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s

    optimizers = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }
    if method not in optimizers:
        raise Exception('Unknown optimization method: "%s"' % method)
    optim_fn = optimizers[method]
    if method == 'sgd':
        assert 'lr' in optim_params

    # check that we give good parameters to the optimizer
    # (inspect.getargspec was removed in Python 3.11; inspect.signature also
    # reports keyword-only arguments, which getargspec()[0] did not)
    signature = inspect.signature(optim_fn.__init__)
    expected_args = [
        name for name, parameter in signature.parameters.items()
        if parameter.kind not in (parameter.VAR_POSITIONAL, parameter.VAR_KEYWORD)
    ]
    assert expected_args[:2] == ['self', 'params']
    if not all(k in expected_args[2:] for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params

def clip_parameters(model, clip):
    """
    Clip model weights.

    Every tensor returned by `model.parameters()` is clamped in place to the
    range [-clip, clip]. Nothing is done unless `clip` is strictly positive.
    """
    if not clip > 0:
        return
    for param in model.parameters():
        param.data.clamp_(-clip, clip)

def read_txt_embeddings(params, source, full_vocab):
    """
    Reload pretrained embeddings from a text file.

    The first line of the file must contain two fields, the second one being
    the embedding dimension (it has to match `params.emb_dim`). Every other
    line is "<word> <v1> <v2> ...".

    - `source` selects the source (`params.src_lang` / `params.src_emb`) or
      target (`params.tgt_lang` / `params.tgt_emb`) side.
    - `full_vocab == False`: words are lowercased and reading stops once
      `params.max_vocab` words are loaded (when `params.max_vocab > 0`).
    - `full_vocab == True`: the whole file is read and duplicated words are
      reported.

    In both modes only the first occurrence of a word is kept, and lines whose
    vector has an unexpected dimension are reported and skipped.

    Returns the dictionary and a float tensor with one row per loaded word.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    lang = params.src_lang if source else params.tgt_lang
    emb_path = params.src_emb if source else params.tgt_emb
    expected_dim = params.emb_dim
    side = 'source' if source else 'target'

    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                # header line: "<number of words> <dimension>"
                header = line.split()
                assert len(header) == 2
                assert expected_dim == int(header[1])
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid to have null embeddings
                    vect[0] = 0.01
                if word in word2id:
                    # only the first occurrence of a word is kept
                    if full_vocab:
                        print("Word '%s' found twice in %s embedding file"
                              % (word, side))
                elif vect.shape != (expected_dim,):
                    print("Invalid dimension (%i) for %s word '%s' in line %i."
                          % (vect.shape[0], side, word, i))
                    continue
                else:
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
            if params.max_vocab > 0 and len(word2id) >= params.max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    print("Loaded %i pre-trained word embeddings." % len(vectors))

    # compute new vocabulary / embeddings
    id2word = {index: token for token, index in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)
    stacked = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(stacked).float()

    assert embeddings.size() == (len(dico), params.emb_dim)
    return dico, embeddings

def select_subset(word_list, max_vocab):
    """
    Select a subset of words to consider, to deal with words having embeddings
    available in different casings. Words are lowercased and only the first
    occurrence of each lowercased form is kept (in a frequency-sorted list,
    that is the most frequent casing, usually of better quality).

    Selection stops as soon as `max_vocab` distinct words are kept
    (`max_vocab <= 0` means no limit).

    Returns the {lowercased word: new index} mapping and a LongTensor holding
    the positions of the kept words in `word_list`.
    """
    word2id = {}
    kept_positions = []
    for position, raw_word in enumerate(word_list):
        key = raw_word.lower()
        if key not in word2id:
            word2id[key] = len(word2id)
            kept_positions.append(position)
        if 0 < max_vocab <= len(word2id):
            break
    assert len(word2id) == len(kept_positions)
    return word2id, torch.LongTensor(kept_positions)

def load_embeddings(params, source, full_vocab=False):
    """
    Reload pretrained embeddings.

    - `full_vocab == False` means that we load the `params.max_vocab` most frequent words.
      It is used at the beginning of the experiment.
      In that setting, if two words with a different casing occur, we lowercase both, and
      only consider the most frequent one. For instance, if "London" and "london" are in
      the embeddings file, we only consider the most frequent one, (in that case, probably
      London). This is done to deal with the lowercased dictionaries.
    - `full_vocab == True` means that we load the entire embedding text file,
      before we export the embeddings at the end of the experiment.

    The loader is chosen from the extension of the embedding path
    (`params.src_emb` when `source` is True, `params.tgt_emb` otherwise):
    '.pth' and '.bin' files go to dedicated loaders, anything else is read as
    a text file.
    NOTE(review): `load_pth_embeddings` and `load_bin_embeddings` are not
    visible in this part of the file -- confirm they are defined.
    """
    # bool cannot be subclassed, so this is the same check as `type(x) is bool`
    assert isinstance(source, bool) and isinstance(full_vocab, bool)
    emb_path = params.src_emb if source else params.tgt_emb
    if emb_path.endswith('.pth'):
        loader = load_pth_embeddings
    elif emb_path.endswith('.bin'):
        loader = load_bin_embeddings
    else:
        loader = read_txt_embeddings
    return loader(params, source, full_vocab)

def normalize_embeddings(emb, types, mean=None):
    """
    Normalize embeddings by their norms / recenter them.

    `types` is a comma-separated list of operations applied in order, in place,
    on `emb`:
        - 'center': subtract `mean` from every row (the mean over the rows of
          `emb` is computed and used when `mean` is None)
        - 'renorm': divide every row by its L2 norm
    Empty entries are ignored, any other value raises an `Exception`.

    Returns the mean that was given or computed (moved to CPU), or None when
    there is none.
    """
    for step in types.split(','):
        if step == 'center':
            if mean is None:
                mean = emb.mean(0, keepdim=True)
            emb.sub_(mean.expand_as(emb))
        elif step == 'renorm':
            row_norms = emb.norm(2, 1, keepdim=True)
            emb.div_(row_norms.expand_as(emb))
        elif step != '':
            raise Exception('Unknown normalization type: "%s"' % step)
    if mean is None:
        return None
    return mean.cpu()

def export_embeddings(src_emb, tgt_emb, params):
    """
    Export embeddings to a text file.

    Two files are written in `params.exp_path`: 'vectors-<src_lang>.txt' and
    'vectors-<tgt_lang>.txt'. Each file starts with a "<rows> <columns>" header
    line, followed by one "<word> <v1> <v2> ..." line per entry of the
    corresponding dictionary (`params.src_dico` / `params.tgt_dico`), with
    values printed with 5 decimals.
    """
    def write_vectors(path, emb, dico):
        # header with the size of the matrix, then one line per word
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(u"%i %i\n" % emb.size())
            for i in range(len(dico)):
                values = " ".join('%.5f' % x for x in emb[i])
                f.write(u"%s %s\n" % (dico[i], values))

    src_path = os.path.join(params.exp_path, 'vectors-%s.txt' % params.src_lang)
    tgt_path = os.path.join(params.exp_path, 'vectors-%s.txt' % params.tgt_lang)

    # source embeddings
    print('Writing source embeddings to %s ...' % src_path)
    write_vectors(src_path, src_emb, params.src_dico)

    # target embeddings
    print('Writing target embeddings to %s ...' % tgt_path)
    write_vectors(tgt_path, tgt_emb, params.tgt_dico)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

utils.py

utils.py

Files

utils.py

Latest commit

History

utils.py

File metadata and controls