From b844df6c4a4dae26d786a0de7e28fcdf1c92a767 Mon Sep 17 00:00:00 2001 From: ty5491003 Date: Tue, 17 Nov 2020 09:50:12 +0800 Subject: [PATCH] Fix bugs. --- README | 2 +- generate.py | 5 +++-- pre.py | 39 --------------------------------------- prepare.py | 10 +++++----- 4 files changed, 9 insertions(+), 47 deletions(-) delete mode 100644 pre.py diff --git a/README b/README index 4e14c73..1a1d937 100644 --- a/README +++ b/README @@ -1,6 +1,6 @@ ## DeepFuzz * run.sh is used to crawl training data from gcc. -* pre.py is used to prepare training data. +* prepare.py is used to prepare training data. * train.py is used to train a s2s model. * generate.py is used to generate new programs with trained model. \ No newline at end of file diff --git a/generate.py b/generate.py index a1026f9..02a970b 100644 --- a/generate.py +++ b/generate.py @@ -8,6 +8,7 @@ import pdb import os import preprocess as pp +import random latent_dim = 512 # Latent dimensionality of the encoding space. num_samples = 2000000 # Number of samples to train on. @@ -53,7 +54,7 @@ input_token_index = dict( [(char, i) for i, char in enumerate(input_characters)]) target_token_index = dict( - [(char, i) for i, char in enumerate(target_characqaters)]) + [(char, i) for i, char in enumerate(target_characters)]) encoder_input_data = np.zeros( (len(input_texts), max_encoder_seq_length, num_encoder_tokens), @@ -296,7 +297,7 @@ def generate(): continue total_count += 1 text = synthesis(text, 'g1', 'nosample') - is_valid = verify_correctness(text, file, 'deepfuzz_g1_nosample') + is_valid = pp.verify_correctness(text, file, 'deepfuzz_g1_nosample') if (is_valid): syntax_valid_count += 1 except: diff --git a/pre.py b/pre.py deleted file mode 100644 index c8b7916..0000000 --- a/pre.py +++ /dev/null @@ -1,39 +0,0 @@ -import re -import glob -from subprocess import Popen, PIPE, STDOUT -from os import walk -import os -import preprocess as pp - -def generate_training_data(text): - maxlen = 50 - sentences = [] - next_chars = [] - for i in range(0, len(text) - maxlen - 1): - sentences.append(text[i: i + maxlen]) - next_chars.append(text[i+1 : i + maxlen]) - sentences[i] = re.sub(r'[\n\t]',' ', sentences[i]) - next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i]) - print(sentences[i] + "\t" + next_chars[i]) - -path = 'testsuite' -files = [] -valid_count = 0 -for root, d_names, f_names in os.walk(path): - for f in f_names: - files.append(os.path.join(root, f)) -for file in files: - print ('--------------------------------------------------') - print (file) - if ('nocomment' in file or 'nospace' in file or 'nomacro' in file or 'raw' in file): - command = 'rm ' + file - p = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) - text = open(file, 'r').read() - text = remove_comment(text) - text = replace_macro(text, file) - text = remove_space(text) - is_valid = verify_correctness(text, file, 'nospace') - if (is_valid): - valid_count += 1 - generate_training_data(text) -print(valid_count) diff --git a/prepare.py b/prepare.py index a4db770..9684fab 100644 --- a/prepare.py +++ b/prepare.py @@ -12,11 +12,11 @@ def generate_training_data(text): sentences = [] next_chars = [] for i in range(0, len(text) - maxlen - 1): - sentences.append(text[i: i + maxlen]) - next_chars.append(text[i + maxlen]) - sentences[i] = re.sub(r'[\n\t]',' ', sentences[i]) - next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i]) - print(sentences[i] + "\t" + next_chars[i]) + sentences.append(text[i: i + maxlen]) + next_chars.append(text[i + maxlen]) + sentences[i] = re.sub(r'[\n\t]',' ', sentences[i]) + next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i]) + print(sentences[i] + "\t" + next_chars[i], file=open('pair', 'a+', encoding='iso-8859-1')) path = './gcc/gcc/testsuite' files = []