From b844df6c4a4dae26d786a0de7e28fcdf1c92a767 Mon Sep 17 00:00:00 2001
From: ty5491003 <nwu_ty@163.com>
Date: Tue, 17 Nov 2020 09:50:12 +0800
Subject: [PATCH] Fix generate.py NameErrors, drop pre.py, save pairs to file

---
 README      |  2 +-
 generate.py |  5 +++--
 pre.py      | 39 ---------------------------------------
 prepare.py  | 10 +++++-----
 4 files changed, 9 insertions(+), 47 deletions(-)
 delete mode 100644 pre.py

diff --git a/README b/README
index 4e14c73..1a1d937 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
 ## DeepFuzz
 
 * run.sh is used to crawl training data from gcc.
-* pre.py is used to prepare training data.
+* prepare.py is used to prepare training data.
 * train.py is used to train a s2s model.
 * generate.py is used to generate new programs with trained model.
\ No newline at end of file
diff --git a/generate.py b/generate.py
index a1026f9..02a970b 100644
--- a/generate.py
+++ b/generate.py
@@ -8,6 +8,7 @@
 import pdb
 import os
 import preprocess as pp
+import random
 
 latent_dim = 512  # Latent dimensionality of the encoding space.
 num_samples = 2000000  # Number of samples to train on.
@@ -53,7 +54,7 @@
 input_token_index = dict(
     [(char, i) for i, char in enumerate(input_characters)])
 target_token_index = dict(
-    [(char, i) for i, char in enumerate(target_characqaters)])
+    [(char, i) for i, char in enumerate(target_characters)])
 
 encoder_input_data = np.zeros(
     (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
@@ -296,7 +297,7 @@ def generate():
                 continue
             total_count += 1
             text = synthesis(text, 'g1', 'nosample')
-            is_valid = verify_correctness(text, file, 'deepfuzz_g1_nosample')
+            is_valid = pp.verify_correctness(text, file, 'deepfuzz_g1_nosample')
             if (is_valid):
                 syntax_valid_count += 1
         except:
diff --git a/pre.py b/pre.py
deleted file mode 100644
index c8b7916..0000000
--- a/pre.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import re
-import glob
-from subprocess import Popen, PIPE, STDOUT
-from os import walk
-import os
-import preprocess as pp
-
-def generate_training_data(text):
-	maxlen = 50
-	sentences = []
-	next_chars = []
-	for i in range(0, len(text) - maxlen - 1):
-	    sentences.append(text[i: i + maxlen])
-	    next_chars.append(text[i+1 : i + maxlen])
-	    sentences[i] = re.sub(r'[\n\t]',' ', sentences[i])
-	    next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i])
-	    print(sentences[i] + "\t" + next_chars[i])
-
-path = 'testsuite'
-files = []
-valid_count = 0
-for root, d_names, f_names in os.walk(path):
-	for f in f_names:
-		files.append(os.path.join(root, f))
-for file in files:
-	print ('--------------------------------------------------')
-	print (file)
-	if ('nocomment' in file or 'nospace' in file or 'nomacro' in file or 'raw' in file):
-		command = 'rm ' + file
-		p = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
-	text = open(file, 'r').read()
-	text = remove_comment(text)
-	text = replace_macro(text, file)
-	text = remove_space(text)
-	is_valid = verify_correctness(text, file, 'nospace')
-	if (is_valid):
-		valid_count += 1
-		generate_training_data(text)	
-print(valid_count)
diff --git a/prepare.py b/prepare.py
index a4db770..9684fab 100644
--- a/prepare.py
+++ b/prepare.py
@@ -12,11 +12,11 @@ def generate_training_data(text):
 	sentences = []
 	next_chars = []
 	for i in range(0, len(text) - maxlen - 1):
-	    sentences.append(text[i: i + maxlen])
-	    next_chars.append(text[i + maxlen])
-	    sentences[i] = re.sub(r'[\n\t]',' ', sentences[i])
-	    next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i])
-	    print(sentences[i] + "\t" + next_chars[i])
+		sentences.append(text[i: i + maxlen])
+		next_chars.append(text[i + maxlen])
+		sentences[i] = re.sub(r'[\n\t]',' ', sentences[i])
+		next_chars[i] = re.sub(r'[\n\t]',' ', next_chars[i])
+		print(sentences[i] + "\t" + next_chars[i], file=open('pair', 'a+', encoding='iso-8859-1'))
 
 path = './gcc/gcc/testsuite'
 files = []