Co-Evolve-LLMs/data_augmentation/word.py at main · SuperBruceJia/Co-Evolve-LLMs

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

# coding=utf-8

import re

import random

import nltk

from nltk.corpus import stopwords

from nltk.corpus import wordnet

# Download the NLTK resources this module relies on into a project-local
# folder. NLTK only searches the directories listed in `nltk.data.path` (plus
# the NLTK_DATA environment variable), so the custom folder is registered
# explicitly; otherwise the corpus readers may raise LookupError even though
# the download succeeded.
_nltk_data_dir = "./save_folder/nltk"

if _nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(_nltk_data_dir)

for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource, download_dir=_nltk_data_dir)

# ASCII punctuation characters; each one is treated as a non-word token and
# is also the pool from which punctuation to insert is drawn.
special_symbols = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Hard-coded English stop words (lower-case), kept as a list in a fixed order.
# Written as one whitespace-separated string so the vocabulary is easy to scan.
stop_words = (
    "i me my myself we our "
    "ours ourselves you your yours "
    "yourself yourselves he him his "
    "himself she her hers herself "
    "it its itself they them their "
    "theirs themselves what which who "
    "whom this that these those am "
    "is are was were be been being "
    "have has had having do does did "
    "doing a an the and but if or "
    "because as until while of at "
    "by for with about against between "
    "into through during before after "
    "above below to from up down in "
    "out on off over under again "
    "further then once here there when "
    "where why how all any both each "
    "few more most other some such no "
    "nor not only own same so than too "
    "very s t can will just don "
    "should now"
).split()

def get_synonyms(word):
    """
    Collect the lemma names of every WordNet synset found for `word`.

    The names are returned in lookup order and are not de-duplicated, so the
    list may contain repeats; it is empty when WordNet has no synset for
    `word`.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

class WordPerturb:
    """
    Word-level Prompt Perturbation

    WordPerturb splits a sentence into tokens (runs of word characters, runs
    of whitespace, and single punctuation characters) and offers several
    random perturbations over the tokens that are neither stop words,
    whitespace, nor special symbols.

    NOTE: the number of words in a sentence is only the valid words
    without considering spaces, special symbols, and punctuations.
    Every perturbation touches at most `int(number_of_valid_words * level)`
    tokens, and never more than the number of eligible tokens available.
    """

    def __init__(self, sentence, level):
        """
        :param sentence: the text to perturb
        :param level: perturbation level; multiplied by the number of valid
            words and truncated to an int to obtain the number of tokens
            each perturbation tries to modify
        """
        # Original sentence
        self.sentence = sentence

        # Tokenize into words, whitespace runs and single punctuation marks so
        # that ''.join(self.words) reproduces the sentence exactly
        self.words = re.findall(r"\w+|\s+|[^\w\s]", self.sentence, re.UNICODE)

        # \b\w+\b matches whole words only (no spaces or punctuation)
        self.valid_words = re.findall(r'\b\w+\b', self.sentence)

        # Number of tokens to perturb
        self.num = int(len(self.valid_words) * level)

        # NLTK English stop words, a single space, every special symbol and
        # the module-level stop word list are all excluded from perturbation
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.add(" ")
        self.stop_words.update(special_symbols)
        self.stop_words.update(stop_words)

    def _candidate_positions(self):
        """Return the indices of tokens that are neither stop words nor whitespace."""
        # Whitespace runs other than a single space ("  ", "\n", "\t") are not
        # in the stop word set, so they are filtered out explicitly
        return [
            i for i, word in enumerate(self.words)
            if word.lower() not in self.stop_words and not word.isspace()
        ]

    def _sample(self, positions):
        """Randomly pick `self.num` positions, or all of them if fewer are available."""
        # self.num counts stop words too, so it can exceed len(positions);
        # clamping keeps random.sample from raising ValueError
        return random.sample(positions, min(self.num, len(positions)))

    def _synonyms_by_position(self, positions):
        """Map every position whose token has at least one synonym to its synonym list."""
        found = {}
        for i in positions:
            synonyms = get_synonyms(self.words[i])
            if synonyms:
                found[i] = synonyms
        return found

    def synonym_replacement(self):
        """
        Randomly choose n words from the sentence that are not stop words.
        Replace each of these words with one of its synonyms chosen at random.

        Words without any synonym are skipped; if fewer than n words qualify,
        all qualifying words are replaced. Returns the original sentence when
        no word qualifies.
        """
        sen_list = self.words[:]

        # Look the synonyms up once per candidate and keep only words that have some
        synonyms_at = self._synonyms_by_position(self._candidate_positions())
        positions = list(synonyms_at)

        # Randomly sample `self.num` positions when there are more than needed
        if self.num < len(positions):
            positions = random.sample(positions, self.num)

        # Replace chosen words with random synonyms (multi-word lemmas use '_')
        for index in positions:
            sen_list[index] = random.choice(synonyms_at[index]).replace('_', ' ')

        # Join the modified words back into a string
        return ''.join(sen_list)

    def word_insertion(self):
        """
        Find a random synonym of a random word in the sentence that is not a stop word.
        Insert that synonym, followed by a space, in front of a randomly
        chosen non-stop word.
        Do this n times (or as many times as there are non-stop words).

        Returns the original sentence when no word has a synonym.
        """
        sen_list = self.words.copy()

        # Positions of the non-stop words: the possible insertion points
        non_sw_posi = self._candidate_positions()

        # Positions whose word has at least one synonym: the synonym sources
        synonyms_at = self._synonyms_by_position(non_sw_posi)
        positions = list(synonyms_at)

        # Return the original sentence if all the words don't have synonyms
        if not positions:
            return ''.join(sen_list)

        # Randomly sample `self.num` source positions when there are more than needed
        if self.num < len(positions):
            positions = random.sample(positions, self.num)

        # Insertion points, limited to the number of non-stop words available
        positions_insert = set(self._sample(non_sw_posi))

        sen_init = []
        for index, word in enumerate(sen_list):
            if index in positions_insert:
                # Insert a synonym of a randomly chosen source word in front
                # of the current word
                source = random.choice(positions)
                word_insert = random.choice(synonyms_at[source]).replace('_', ' ')
                sen_init.append(word_insert)
                sen_init.append(" ")
            sen_init.append(word)

        # Convert the modified list back to a string
        return ''.join(sen_init)

    def word_swap(self):
        """
        Randomly choose two words in the sentence and swap their positions.
        Do this n times (or as many times as there are non-stop words).
        """
        sen_list = self.words.copy()
        positions = self._candidate_positions()

        # Two independent random orderings; the i-th entries are swapped
        position_1 = self._sample(positions)
        position_2 = self._sample(positions)

        for first, second in zip(position_1, position_2):
            sen_list[first], sen_list[second] = sen_list[second], sen_list[first]

        # Convert the list of tokens back to a string
        return ''.join(sen_list)

    def word_deletion(self):
        """
        Randomly remove n non-stop words from the sentence
        (all of them if fewer than n are available).
        """
        # Randomly sample the positions to drop
        doomed = set(self._sample(self._candidate_positions()))

        # Keep every token whose position was not selected
        return ''.join(
            word for index, word in enumerate(self.words) if index not in doomed
        )

    def insert_punctuation(self):
        """
        Insert a random special symbol right after n randomly chosen non-stop
        words (after every non-stop word if fewer than n are available).
        """
        positions = self._candidate_positions()

        # Randomly sample `self.num` positions when there are more than needed
        if self.num < len(positions):
            positions = random.sample(positions, self.num)
        chosen = set(positions)

        sen_init = []
        for index, word in enumerate(self.words):
            sen_init.append(word)
            if index in chosen:
                # Append a random symbol behind the chosen word
                sen_init.append(random.choice(special_symbols))

        # Convert the modified list back to a string
        return ''.join(sen_init)

    def word_split(self):
        """
        Randomly split n non-stop words into two tokens separated by a space.

        Selected words of a single character cannot be split and are kept
        unchanged.
        """
        # Randomly sample the positions to split
        chosen = set(self._sample(self._candidate_positions()))

        sen_init = []
        for index, word in enumerate(self.words):
            if index in chosen and len(word) > 1:
                # Split at a random inner boundary so both pieces are non-empty
                indice = random.randint(1, len(word) - 1)
                sen_init.append(word[:indice])
                sen_init.append(" ")
                sen_init.append(word[indice:])
            else:
                sen_init.append(word)

        # Convert the modified list back to a string
        return ''.join(sen_init)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

word.py

word.py

Files

word.py

Latest commit

History

word.py

File metadata and controls