From 847a81a09fcb5b811b7bc6643e278c41b3d329c6 Mon Sep 17 00:00:00 2001
From: Richard Wu <richardwu1997@gmail.com>
Date: Sat, 16 Feb 2019 18:31:40 -0500
Subject: [PATCH 1/3] Store domain as a Postgres array type and keep as list in
 memory. Remove domain_size column.

---
 dataset/dataset.py                     |  4 ++--
 domain/domain.py                       | 14 +++++---------
 domain/estimators/logistic.py          | 10 +++++-----
 domain/estimators/naive_bayes.py       |  2 +-
 repair/featurize/featurized_dataset.py |  2 +-
 repair/featurize/freqfeat.py           |  4 ++--
 repair/featurize/initsimfeat.py        |  4 ++--
 repair/featurize/langmodelfeat.py      |  4 ++--
 repair/featurize/occurattrfeat.py      |  2 +-
 9 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index ee1c5ae4e..3c1ac9ccc 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -267,7 +267,7 @@ def get_domain_info(self):
         """
         Returns (number of random variables, count of distinct values across all attributes).
         """
-        query = 'SELECT count(_vid_), max(domain_size) FROM %s'%AuxTables.cell_domain.name
+        query = 'SELECT COUNT(_vid_), MAX(CARDINALITY(domain::TEXT[])) FROM %s'%AuxTables.cell_domain.name
         res = self.engine.execute_query(query)
         total_vars = int(res[0][0])
         classes = int(res[0][1])
@@ -279,7 +279,7 @@ def get_inferred_values(self):
         query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \
                 "FROM " \
                 "(SELECT _tid_, attribute, " \
-                "_vid_, init_value, string_to_array(regexp_replace(domain, \'[{\"\"}]\', \'\', \'gi\'), \'|||\') as domain " \
+                "_vid_, init_value, domain" \
                 "FROM %s) as t1, %s as t2 " \
                 "WHERE t1._vid_ = t2._vid_"%(AuxTables.cell_domain.name, AuxTables.inf_values_idx.name)
         self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_'])
diff --git a/domain/domain.py b/domain/domain.py
index 4dbce5e93..c8afa2ad1 100644
--- a/domain/domain.py
+++ b/domain/domain.py
@@ -124,7 +124,7 @@ def store_domains(self, domain):
             self.ds.generate_aux_table(AuxTables.cell_domain, domain, store=True, index_attrs=['_vid_'])
             self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_tid_'])
             self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_cid_'])
-            query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(string_to_array(regexp_replace(domain,\'[{\"\"}]\',\'\',\'gi\'),\'|||\')) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name
+            query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(domain::TEXT[]) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name
             self.ds.generate_aux_table_sql(AuxTables.pos_values, query, index_attrs=['_tid_', 'attribute'])
 
     def setup_attributes(self):
@@ -219,8 +219,7 @@ def generate_domain(self):
             _cid_: cell ID (unique for every entity-attribute)
             _vid_: variable ID (1-1 correspondence with _cid_)
             attribute: attribute name
-            domain: ||| separated string of domain values
-            domain_size: length of domain
+            domain: array of domain values
             init_value: initial value for this cell
             init_value_idx: domain index of init_value
             fixed: 1 if a random sample was taken since no correlated attributes/top K values
@@ -252,8 +251,7 @@ def generate_domain(self):
                                 "attribute": attr,
                                 "_cid_": cid,
                                 "_vid_": vid,
-                                "domain": "|||".join(dom),
-                                "domain_size": len(dom),
+                                "domain": dom,
                                 "init_value": init_value,
                                 "init_index": init_value_idx,
                                 "weak_label": weak_label,
@@ -270,8 +268,7 @@ def generate_domain(self):
                                     "attribute": attr,
                                     "_cid_": cid,
                                     "_vid_": vid,
-                                    "domain": "|||".join(dom),
-                                    "domain_size": len(dom),
+                                    "domain": dom,
                                     "init_value": init_value,
                                     "init_index": init_value_idx,
                                     "weak_label": init_value,
@@ -323,8 +320,7 @@ def generate_domain(self):
                 domain_values.append(row['init_value'])
             domain_values = sorted(domain_values)
             # update our memoized domain values for this row again
-            row['domain'] = '|||'.join(domain_values)
-            row['domain_size'] = len(domain_values)
+            row['domain'] = domain_values
             row['weak_label_idx'] = domain_values.index(row['weak_label'])
             row['init_index'] = domain_values.index(row['init_value'])
 
diff --git a/domain/estimators/logistic.py b/domain/estimators/logistic.py
index ca247a7b4..abd05f7e6 100644
--- a/domain/estimators/logistic.py
+++ b/domain/estimators/logistic.py
@@ -25,7 +25,7 @@ def __init__(self, env, dataset, domain_df, active_attrs, batch_size=32):
         """
         :param dataset: (Dataset) original dataset
         :param domain_df: (DataFrame) currently populated domain dataframe.
-            Required columns are: _vid_, _tid_, attribute, domain, domain_size, init_value
+            Required columns are: _vid_, _tid_, attribute, domain, init_value
         :param active_attrs: (list[str]) attributes that have random values
         """
         torch.nn.Module.__init__(self)
@@ -40,7 +40,7 @@ def __init__(self, env, dataset, domain_df, active_attrs, batch_size=32):
         # self.dom maps tid --> attr --> list of domain values
         # we need to find the number of domain values we will be generating
         # a training sample for.
-        self.n_samples = int(domain_df['domain_size'].sum())
+        self.n_samples = int(domain_df['domain'].apply(len).sum())
 
         # Create and initialize featurizers.
         self.featurizers = [CooccurAttrFeaturizer(self.ds)]
@@ -86,7 +86,7 @@ def _gen_training_data(self):
         self.vid_to_idxs = {}
         for rec in tqdm(list(self.domain_records)):
             init_row = raw_data_dict[rec['_tid_']]
-            domain_vals = rec['domain'].split('|||')
+            domain_vals = rec['domain']
 
             # Generate the feature tensor for all the domain values for this
             # cell.
@@ -166,7 +166,7 @@ def predict_pp(self, row, attr=None, values=None):
         start_idx, end_idx = self.vid_to_idxs[row['_vid_']]
         pred_X = self._X[start_idx:end_idx]
         pred_Y = self.forward(pred_X)
-        values = self.domain_records[row['_vid_']]['domain'].split('|||')
+        values = self.domain_records[row['_vid_']]['domain']
         return zip(values, map(float, pred_Y))
 
     def predict_pp_batch(self, raw_records_by_tid=None, cell_domain_rows=None):
@@ -179,7 +179,7 @@ def predict_pp_batch(self, raw_records_by_tid=None, cell_domain_rows=None):
         """
         pred_Y = self.forward(self._X)
         for rec in self.domain_records:
-            values = rec['domain'].split('|||')
+            values = rec['domain']
             start_idx, end_idx = self.vid_to_idxs[rec['_vid_']]
             yield zip(values, map(float, pred_Y[start_idx:end_idx]))
 
diff --git a/domain/estimators/naive_bayes.py b/domain/estimators/naive_bayes.py
index c3235dfa4..21c8ff2cf 100644
--- a/domain/estimators/naive_bayes.py
+++ b/domain/estimators/naive_bayes.py
@@ -66,7 +66,7 @@ def predict_pp_batch(self):
         val is the domain value and proba is the estimator's posterior probability estimate.
         """
         for row in tqdm(self.domain_df.to_records()):
-            yield self.predict_pp(self._raw_records_by_tid[row['_tid_']], row['attribute'], row['domain'].split('|||'))
+            yield self.predict_pp(self._raw_records_by_tid[row['_tid_']], row['attribute'], row['domain'])
 
     def _get_corr_attributes(self, attr):
         """
diff --git a/repair/featurize/featurized_dataset.py b/repair/featurize/featurized_dataset.py
index 2ce666bd2..724aa33fe 100644
--- a/repair/featurize/featurized_dataset.py
+++ b/repair/featurize/featurized_dataset.py
@@ -101,7 +101,7 @@ def generate_var_mask(self):
             is valid for the i-th VID and tensor[i][j] = -10e6 otherwise.
         """
         var_to_domsize = {}
-        query = 'SELECT _vid_, domain_size FROM %s' % AuxTables.cell_domain.name
+        query = 'SELECT _vid_, CARDINALITY(domain::TEXT[]) FROM %s' % AuxTables.cell_domain.name
         res = self.ds.engine.execute_query(query)
         mask = torch.zeros(self.total_vars,self.classes)
         for tuple in tqdm(res):
diff --git a/repair/featurize/freqfeat.py b/repair/featurize/freqfeat.py
index 1e9656560..1c1f5e7c6 100644
--- a/repair/featurize/freqfeat.py
+++ b/repair/featurize/freqfeat.py
@@ -16,7 +16,7 @@ def specific_setup(self):
     def gen_feat_tensor(self, input, classes):
         vid = int(input[0])
         attribute = input[1]
-        domain = input[2].split('|||')
+        domain = input[2]
         attr_idx = self.ds.attr_to_idx[attribute]
         tensor = torch.zeros(1, classes, self.attrs_number)
         for idx, val in enumerate(domain):
@@ -25,7 +25,7 @@ def gen_feat_tensor(self, input, classes):
         return tensor
 
     def create_tensor(self):
-        query = 'SELECT _vid_, attribute, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
+        query = 'SELECT _vid_, attribute, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
         results = self.ds.engine.execute_query(query)
         tensors = [self.gen_feat_tensor(res, self.classes) for res in results]
         combined = torch.cat(tensors)
diff --git a/repair/featurize/initsimfeat.py b/repair/featurize/initsimfeat.py
index 364e0f471..b4d6f4bd9 100644
--- a/repair/featurize/initsimfeat.py
+++ b/repair/featurize/initsimfeat.py
@@ -13,7 +13,7 @@ def gen_feat_tensor(input, classes, total_attrs):
     init_value = input[2]
     # TODO: To add more similarity metrics increase the last dimension of tensor.
     tensor = torch.zeros(1, classes, total_attrs)
-    domain = input[3].split('|||')
+    domain = input[3]
     for idx, val in enumerate(domain):
         if val == init_value:
             sim = -1.0
@@ -53,7 +53,7 @@ def specific_setup(self):
                 raise ValueError("The size of init_weight for InitSimFeaturizer %d does not match the number of attributes %d." % (self.init_weight.shape[0], len(self.all_attrs)))
 
     def create_tensor(self):
-        query = 'SELECT _vid_, attribute, init_value, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
+        query = 'SELECT _vid_, attribute, init_value, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
         results = self.ds.engine.execute_query(query)
         map_input = []
         for res in results:
diff --git a/repair/featurize/langmodelfeat.py b/repair/featurize/langmodelfeat.py
index 7332daf18..fac4c9a97 100644
--- a/repair/featurize/langmodelfeat.py
+++ b/repair/featurize/langmodelfeat.py
@@ -21,7 +21,7 @@ def specific_setup(self):
     def gen_feat_tensor(self, input, classes):
         vid = int(input[0])
         attribute = input[1]
-        domain = input[2].split('|||')
+        domain = input[2]
         attr_idx = self.ds.attr_to_idx[attribute]
         model = self.attr_language_model[attribute]
         tensor = torch.zeros(1, classes, self.attrs_number*self.emb_size)
@@ -33,7 +33,7 @@ def gen_feat_tensor(self, input, classes):
         return tensor
 
     def create_tensor(self):
-        query = 'SELECT _vid_, attribute, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
+        query = 'SELECT _vid_, attribute, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
         results = self.ds.engine.execute_query(query)
         tensors = [self.gen_feat_tensor(res, self.classes) for res in results]
         combined = torch.cat(tensors)
diff --git a/repair/featurize/occurattrfeat.py b/repair/featurize/occurattrfeat.py
index 35f02cfac..e08b0e151 100644
--- a/repair/featurize/occurattrfeat.py
+++ b/repair/featurize/occurattrfeat.py
@@ -47,7 +47,7 @@ def create_tensor(self):
     def gen_feat_tensor(self, row, tuple):
         tensor = torch.zeros(1, self.classes, self.attrs_number*self.attrs_number)
         rv_attr = row['attribute']
-        domain = row['domain'].split('|||')
+        domain = row['domain']
         rv_domain_idx = {val: idx for idx, val in enumerate(domain)}
         rv_attr_idx = self.ds.attr_to_idx[rv_attr]
         for attr in self.all_attrs:

From 9a65495c7bdce19870cf0020240fcf677f16e375 Mon Sep 17 00:00:00 2001
From: Richard Wu <richardwu1997@gmail.com>
Date: Mon, 4 Mar 2019 18:21:03 -0500
Subject: [PATCH 2/3] Add missing TEXT[] cast and trailing space to get_inferred_values SQL query

---
 dataset/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index 3c1ac9ccc..b57c7d9c2 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -279,7 +279,7 @@ def get_inferred_values(self):
         query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \
                 "FROM " \
                 "(SELECT _tid_, attribute, " \
-                "_vid_, init_value, domain" \
+                "_vid_, init_value, domain::TEXT[] " \
                 "FROM %s) as t1, %s as t2 " \
                 "WHERE t1._vid_ = t2._vid_"%(AuxTables.cell_domain.name, AuxTables.inf_values_idx.name)
         self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_'])

From bd48158dbb2f3f323df520cc524df2e19b1a6913 Mon Sep 17 00:00:00 2001
From: Richard Wu <richardwu1997@gmail.com>
Date: Mon, 4 Mar 2019 18:21:15 -0500
Subject: [PATCH 3/3] Temporarily switch the repair example to the MET dataset

---
 examples/holoclean_repair_example.py | 13 ++++++-------
 testdata/met_constraints.txt         | 22 ++++++++++------------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/examples/holoclean_repair_example.py b/examples/holoclean_repair_example.py
index d9c78d0ba..60c8afc62 100644
--- a/examples/holoclean_repair_example.py
+++ b/examples/holoclean_repair_example.py
@@ -27,8 +27,8 @@
 ).session
 
 # 2. Load training data and denial constraints.
-hc.load_data('hospital', '../testdata/hospital.csv')
-hc.load_dcs('../testdata/hospital_constraints.txt')
+hc.load_data('hospital', '../testdata/met_1000_transformed.csv')
+hc.load_dcs('../testdata/met_constraints.txt')
 hc.ds.set_constraints(hc.get_dcs())
 
 # 3. Detect erroneous cells using these two detectors.
@@ -42,13 +42,12 @@
     OccurAttrFeaturizer(),
     FreqFeaturizer(),
     ConstraintFeaturizer(),
-    LangModelFeaturizer(),
 ]
 
 hc.repair_errors(featurizers)
 
 # 5. Evaluate the correctness of the results.
-hc.evaluate(fpath='../testdata/hospital_clean.csv',
-            tid_col='tid',
-            attr_col='attribute',
-            val_col='correct_val')
+# hc.evaluate(fpath='../testdata/hospital_clean.csv',
+#             tid_col='tid',
+#             attr_col='attribute',
+#             val_col='correct_val')
diff --git a/testdata/met_constraints.txt b/testdata/met_constraints.txt
index 4d0160ffb..ab02ddd60 100644
--- a/testdata/met_constraints.txt
+++ b/testdata/met_constraints.txt
@@ -1,14 +1,12 @@
-t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.City,t2.City)
-t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.State,t2.State)
-t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.Country,t2.Country)
-t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.City,t2.City)
-t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.State,t2.State)
-t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.Country,t2.Country)
-t1&t2&EQ(t1.River,t2.River)&IQ(t1.Country,t2.Country)
-t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.City,t2.City)
-t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.State,t2.State)
-t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Country,t2.Country)
+# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.City,t2.City)
+# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.State,t2.State)
+# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.Country,t2.Country)
+# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.City,t2.City)
+# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.State,t2.State)
+# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.Country,t2.Country)
+# t1&t2&EQ(t1.River,t2.River)&IQ(t1.Country,t2.Country)
+# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.City,t2.City)
+# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.State,t2.State)
+# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Country,t2.Country)
 t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Dynasty,t2.Dynasty)
 t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Period,t2.Period)
-
-