1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
| import pickle import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle import numpy as np import random import nltk from nltk.corpus import wordnet from nltk.corpus import stopwords nltk.download('stopwords') nltk.download('wordnet') nltk.download('omw-1.4')
def plot_graphs(history, metric):
    """
    Draw the training and validation curves of one metric on the current matplotlib figure.

    history -> Object whose `history` attribute is indexable by metric name; it must contain both `metric` and 'val_' + metric (presumably a Keras-style History object - confirm against the caller)
    metric -> Name of the metric to plot; also used as the y-axis label
    """
    val_metric = 'val_' + metric
    curves = history.history

    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
def eda_SR(originalSentence, n):
    """
    Synonym Replacement (SR).

    Picks n distinct non stop word positions at random and overwrites the word at each
    position with a randomly chosen WordNet lemma name that differs from that word.
    A picked word for which WordNet offers no such lemma is left as it is.

    originalSentence -> The sentence to augment; it is tokenised on single spaces
    n -> The number of non stop words to pick for replacement

    Raises an Exception when n is larger than the number of non stop words.
    """
    stopWordSet = set(stopwords.words('english'))
    tokens = originalSentence.split(" ")
    # Immutable snapshot: synonyms are always looked up for the untouched words.
    sourceWords = tuple(tokens)

    candidatePositions = [
        position
        for position, word in enumerate(tokens)
        if word.lower() not in stopWordSet
    ]
    if len(candidatePositions) < n:
        raise Exception(
            "The number of replacements exceeds the number of non stop word words")

    for _ in range(n):
        # Draw without replacement so no position is rewritten twice.
        position = random.choice(candidatePositions)
        candidatePositions.remove(position)

        target = sourceWords[position]
        alternatives = [
            lemma.name()
            for synset in wordnet.synsets(target)
            for lemma in synset.lemmas()
            if lemma.name() != target
        ]
        if alternatives:
            # Multi-word lemma names use '_' as the separator.
            tokens[position] = random.choice(alternatives).replace('_', ' ')

    return " ".join(tokens)
def eda_RI(originalSentence, n):
    """
    Random Insertion (RI).

    Paper Methodology -> Find a random synonym of a random word in the sentence that is not a stop word.
    Insert that synonym into a random position in the sentence. Do this n times

    Each non stop word is used as the synonym source at most once. A chosen word for which
    WordNet offers no lemma name different from itself produces no insertion, so fewer than
    n words may end up being inserted.

    originalSentence -> The sentence on which EDA is to be applied; it is tokenised on single spaces
    n -> The number of times the process has to be repeated

    Raises an Exception when n is larger than the number of non stop words.
    """
    stops = set(stopwords.words('english'))
    tokens = originalSentence.split(" ")
    # Snapshot of the original words: positions in `candidatePositions` refer to
    # this tuple, which is not shifted by the insertions made below.
    sourceWords = tuple(tokens)

    candidatePositions = [
        position
        for position, word in enumerate(sourceWords)
        if word.lower() not in stops
    ]
    if n > len(candidatePositions):
        raise Exception("The number of replacements exceeds the number of non stop word words")

    for _ in range(n):
        position = random.choice(candidatePositions)
        candidatePositions.remove(position)

        target = sourceWords[position]
        synonyms = [
            lemma.name()
            for synset in wordnet.synsets(target)
            for lemma in synset.lemmas()
            if lemma.name() != target
        ]
        if not synonyms:
            continue

        # Draw the slot from the CURRENT length (inclusive upper bound == append),
        # so every gap of the growing sentence, including the end, is eligible.
        insertAt = random.randint(0, len(tokens))
        tokens.insert(insertAt, random.choice(synonyms).replace('_', ' '))

    return " ".join(tokens)
def eda_RS(originalSentence, n):
    """
    Random Swap (RS).

    Chooses two positions in the sentence at random and exchanges the words found there.
    The two positions are always different unless the sentence consists of a single token.
    Do this n times

    originalSentence -> The sentence to augment; it is tokenised on single spaces
    n -> The number of swaps to perform
    """
    tokens = originalSentence.split(" ")
    lastIndex = len(tokens) - 1
    singleToken = lastIndex == 0

    for _ in range(n):
        left = random.randint(0, lastIndex)
        right = random.randint(0, lastIndex)
        if not singleToken:
            # Redraw until the partner differs from the first position.
            while right == left:
                right = random.randint(0, lastIndex)
        tokens[left], tokens[right] = tokens[right], tokens[left]

    return " ".join(tokens)
def eda_RD(originalSentence, p):
    """
    Random Deletion (RD).

    Paper Methodology -> Randomly remove each word in the sentence with probability p.

    One random number is drawn per word, in sentence order. If every word happens to be
    removed, the original sentence is returned unchanged instead of an empty string.

    originalSentence -> The sentence on which EDA is to be applied; it is tokenised on single spaces
    p -> Probability of a Word Being Removed, with 0 <= p < 1

    Raises a ValueError (a subclass of Exception) when p == 1 or when p lies outside [0, 1].
    """
    if p == 1:
        raise ValueError("Always an Empty String Will Be Returned")
    if p > 1 or p < 0:
        raise ValueError("Improper Probability Value")

    # A word is removed when its draw falls strictly below p, so p == 0 can never
    # remove anything (random.random() returns values in [0.0, 1.0)).
    keptWords = [
        word
        for word in originalSentence.split(" ")
        if random.random() >= p
    ]
    if not keptWords:
        return originalSentence
    return " ".join(keptWords)
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """
    Build augmented variants of a sentence with the four EDA operations.

    Each of eda_SR, eda_RI, eda_RS and eda_RD is applied int(num_aug / 4) + 1 times.
    The resulting candidates are shuffled and cut down to num_aug, and the original
    sentence is appended as the last element.

    sentence -> The sentence to augment
    alpha_sr -> Fraction of the word count used as n for eda_SR (at least 1 word)
    alpha_ri -> Fraction of the word count used as n for eda_RI (at least 1 word)
    alpha_rs -> Fraction of the word count used as n for eda_RS (at least 1 swap)
    p_rd -> Per-word removal probability handed to eda_RD
    num_aug -> Number of augmented sentences to keep. A value below 1 is treated as a
               keep rate: each candidate is kept with probability num_aug / number of candidates

    Returns a list holding the kept augmented sentences followed by the original sentence.

    Exceptions raised by the individual operations propagate; eda_SR and eda_RI raise
    when the requested n exceeds the number of non stop words in the sentence.
    """
    num_words = len(sentence.split())

    num_new_per_technique = int(num_aug / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))

    # (operation, strength) pairs, applied in this order.
    techniques = (
        (eda_SR, n_sr),
        (eda_RI, n_ri),
        (eda_RS, n_rs),
        (eda_RD, p_rd),
    )
    augmented_sentences = [
        technique(sentence, strength)
        for technique, strength in techniques
        for _ in range(num_new_per_technique)
    ]

    # Shuffle in place. The module-level `shuffle` comes from sklearn.utils and
    # returns a shuffled copy, so calling it without using the result left the
    # list in technique order and the truncation below always dropped the
    # random-deletion outputs first.
    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        # int() keeps a float num_aug (e.g. 2.0) usable as a slice bound.
        augmented_sentences = augmented_sentences[:int(num_aug)]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [
            s for s in augmented_sentences if random.uniform(0, 1) < keep_prob
        ]

    augmented_sentences.append(sentence)
    return augmented_sentences
def get_eda_df(sentences, alpha=0.1, num_avg=9):
    """
    Run eda over every sentence and return all outputs as one flat list.

    Despite the name, the return value is a plain list, not a DataFrame. For each input
    sentence it holds that sentence's augmented variants followed by the sentence itself,
    in input order.

    sentences -> Iterable of sentences to augment
    alpha -> Value passed to eda as alpha_sr, alpha_ri, alpha_rs and p_rd alike
    num_avg -> Value passed to eda as num_aug
    """
    results = []
    for sentence in sentences:
        # extend() grows the list in place; sum(list_of_lists, []) would copy the
        # accumulated list once per sentence.
        results.extend(
            eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha,
                p_rd=alpha, num_aug=num_avg)
        )
    return results
if __name__ == '__main__':
    # Demo run: augment one sample sentence and print how many sentences came
    # back, followed by the sentences themselves.
    sample_sentence = (
        "microcontroller coprocessor bmc boot up device enabled asserted "
        "microcontroller bmc boot up device enabled asserted "
        "processor cpu1 status presence detected asserted "
        "processor cpu0 status presence detected asserted "
        "system acpi power state acpi pwr status ss one state soft off asserted "
        "button button pressed power button pressed asserted "
        "system acpi power state acpi pwr status sg one state working asserted "
        "power supply ps1 status presence detected asserted "
        "power supply ps2 status presence detected asserted"
    )
    demo_output = eda(
        sample_sentence,
        alpha_sr=0.05,
        alpha_ri=0.05,
        alpha_rs=0.05,
        p_rd=0.05,
        num_aug=9,
    )
    print(len(demo_output))
    print(demo_output)
|