Merged
49 commits
02f8a4f
Relocated test data into subdirectories.
richardwu Apr 5, 2019
99c45f6
Move active attributes to right after error detection and inside
richardwu Apr 5, 2019
da83103
Remove 2.7 tests from Travis because of Python 3 features.
richardwu Apr 5, 2019
a340cac
Refactor domain generation sort domain by co-occurrence probability and
richardwu Apr 5, 2019
96080c1
Make co-occurrence featurizer only generate co-occurrence features for
richardwu Apr 5, 2019
673a779
Implemented TupleEmbedding model as an estimator.
richardwu Apr 5, 2019
78849ed
Always load clean/ground truth as strings since we load/store raw dat…
richardwu Apr 5, 2019
44afd2d
Added featurizer for learned embeddings from TupleEmbedding model.
richardwu Apr 5, 2019
a6e5453
Support multiple layers during repair and made TupleEmbedding dump/load
richardwu Apr 7, 2019
8b2e076
Quantization and handling of numerical/mixed data.
zaqthss Apr 3, 2019
116e656
Improved validation logging and fixed a few bugs.
richardwu Apr 8, 2019
e2bc74c
Improve validation in TupleEmbedding using pandas dataframes.
richardwu Apr 9, 2019
ce2dc33
Support multi-dimensional quantization.
richardwu Apr 9, 2019
33be8a4
Quantize from dict rather than numerical attrs.
richardwu Apr 18, 2019
67547f5
Mean/var normalize numerical attributes in context and added
richardwu Apr 19, 2019
61b4147
Support specifying n-dimensional numerical attr groups vs splitting on
richardwu Apr 20, 2019
cd0fd9a
Fixed None numerical_attr_groups.
richardwu Apr 20, 2019
677e602
Fixed report RMS error and converting to floats for quantization.
richardwu Apr 20, 2019
22950d4
Added store_to_fb flag to load_data, added LR schedule to
richardwu Apr 28, 2019
873a779
Pre-split domain and ground truth values.
richardwu Apr 30, 2019
2ba3116
Fixed batch size argument in EmbeddingFeaturizer.
richardwu May 1, 2019
6bc733e
Removed numerical_attrs reference from Table.
richardwu May 1, 2019
d902be8
Fix to how multi-ground truth is handled. Use simplified numerical
richardwu May 5, 2019
4a43c6a
Max domain size need only be as large as largest for categorical
richardwu May 7, 2019
123845d
Remove domain for numerical attributes in TupleEmbedding.
richardwu May 7, 2019
186099e
Fixed some reference issues and added infer all mode.
richardwu May 17, 2019
71c0e37
Fixed _nan_ replacement, max_cat_domain being possibly nan, and
richardwu May 20, 2019
4c5cd30
Do not weak label clean cells and fixed raw data in Logistic estimator.
richardwu May 25, 2019
bc3a3ed
Added ReLU after context for numerical targets in TupleEmbedding and
richardwu May 28, 2019
69ba100
Use cosine annealing with restart LR schedule and use weak_label instead
richardwu Jun 2, 2019
120ab08
Fixed memory issues with get_features and predict_pp_batch.
richardwu Jun 4, 2019
36e4fff
Fixed bug in get_features.
richardwu Jun 5, 2019
4dca0e9
Added comment to EmbeddingFeat.
richardwu Jun 5, 2019
3792df0
Finally fixed memory issues with torch.no_grad.
richardwu Jun 8, 2019
d22c91a
ConstraintFeaturizer runs on un-quantized values.
richardwu Jun 10, 2019
b13ccb2
Do not drop single value cells (for evaluation).
richardwu Jun 10, 2019
f9c14f1
Do not generate queries/feature for DC that does not pertain to
richardwu Jun 10, 2019
dc7433a
Fixed ConstraintFeaturizer to handle no DCs.
richardwu Jun 10, 2019
1d9b92c
Removed deprecated code and added dropout.
richardwu Jun 11, 2019
c0d7f0b
Fixed calculation of num_batches in learning loop.
richardwu Jun 12, 2019
aa26704
do not drop null inits cells with dom(len) <= 1
zaqthss Jun 17, 2019
cbd7dca
Fixed z-scoring with 0 std and deleting e-notation numerical values.
richardwu Jun 19, 2019
aea74a5
Do not quantize if bins > unique.
richardwu Jun 20, 2019
05f04bf
Fixed some things in domain.
richardwu Jun 22, 2019
6e53187
Added repair w/ validation set and removed multiple correct values in
richardwu Jul 4, 2019
ad0bd4c
Fixed domain generation to include single value cells in domain.
richardwu Jul 19, 2019
59b4e3f
Handle untrained context values properly and added code for domain
richardwu Jul 25, 2019
ddbf829
Regression fix for moving raw_data_dict before z-normalization and
richardwu Jul 27, 2019
1e85bbd
Merge branch 'dev' into hcq-embedding-3
minafarid Sep 26, 2019
26 changes: 9 additions & 17 deletions domain/domain.py
@@ -151,7 +151,7 @@ def get_corr_attributes(self, attr, thres):
attr_correlations = self.correlations[attr]
return sorted([corr_attr
for corr_attr, corr_strength in attr_correlations.items()
if corr_attr != attr and corr_strength > thres])
if corr_attr != attr and corr_strength >= thres])

def generate_domain(self):
"""
@@ -213,23 +213,15 @@ def generate_domain(self):
# This would be a "SINGLE_VALUE" example and we'd still
# like to generate a random domain for it.
if init_value == NULL_REPR and len(dom) == 0:
continue
continue

# Not enough domain values, we need to get some random
# values (other than 'init_value') for training. However,
# this might still get us zero domain values.
rand_dom_values = self.get_random_domain(attr, init_value)

# rand_dom_values might still be empty. In this case,
# there are no other possible values for this cell. There
# is not point to use this cell for training and there is no
# point to run inference on it since we cannot even generate
# a random domain. Therefore, we just ignore it from the
# final tensor.
# We do not drop NULL cells since we stil have to repair them
# with their 1 domain value.
if init_value != NULL_REPR and len(rand_dom_values) == 0:
continue
rand_dom_values = self.get_random_domain(attr, dom)

# We still want to add cells with only 1 single value and no
# additional random domain # they are required in the output.

# Otherwise, just add the random domain values to the domain
# and set the cell status accordingly.
@@ -334,16 +326,16 @@ def get_domain_cell(self, attr, row):

return init_value, init_value_idx, domain_lst

def get_random_domain(self, attr, cur_value):
def get_random_domain(self, attr, cur_dom):
"""
get_random_domain returns a random sample of at most size
'self.max_sample' of domain values for 'attr' that is NOT 'cur_value'.
'self.max_sample' of domain values for 'attr' that is NOT in 'cur_dom'.
"""
domain_pool = set(self.single_stats[attr].keys())
# We should not have any NULLs since we do not keep track of their
# counts.
assert NULL_REPR not in domain_pool
domain_pool.discard(cur_value)
domain_pool = domain_pool.difference(cur_dom)
domain_pool = sorted(list(domain_pool))
size = len(domain_pool)
if size > 0:
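The sampling tail of get_random_domain is collapsed above. As a reading aid, a minimal standalone sketch of the behaviour after this change, where every value already in the cell's domain is excluded rather than only the initial value; the random.sample call, the max_sample cap, and the '_nan_' default are assumptions, since the diff only shows the pool construction:

import random

def get_random_domain(single_stats, max_sample, attr, cur_dom, null_repr='_nan_'):
    # Sample at most max_sample values for 'attr' that are NOT in cur_dom.
    domain_pool = set(single_stats[attr].keys())
    assert null_repr not in domain_pool   # NULL counts are never tracked
    domain_pool = sorted(domain_pool.difference(cur_dom))
    size = len(domain_pool)
    if size > 0:
        # Assumed tail: uniform sample without replacement, capped at max_sample.
        k = min(max_sample, size)
        return [domain_pool[i] for i in random.sample(range(size), k)]
    return []

# Example: get_random_domain({'city': {'NY': 5, 'LA': 3, 'SF': 2}}, 2, 'city', {'NY'})
# returns a 2-element sample drawn from ['LA', 'SF'].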
132 changes: 106 additions & 26 deletions domain/estimators/tuple_embedding.py
@@ -109,6 +109,8 @@ def __init__(self, env, dataset, domain_df,
# Attributes to derive context from
self._init_cat_attrs, self._init_num_attrs = self._split_cat_num_attrs(self._all_attrs)
self._n_init_cat_attrs, self._n_init_num_attrs = len(self._init_cat_attrs), len(self._init_num_attrs)
self._n_init_attrs = len(self._all_attrs)

logging.debug('%s: init categorical attributes: %s',
type(self).__name__,
self._init_cat_attrs)
@@ -129,7 +131,15 @@ def __init__(self, env, dataset, domain_df,
self._train_num_attrs)

# Make copy of raw data
# Quantized data is used for co-occurrence statistics in the last layer
# for categorical targets.
self._raw_data = self.ds.get_raw_data().copy()
self._qtized_raw_data = self.ds.get_quantized_data() if self.ds.do_quantization else self._raw_data
self._qtized_raw_data_dict = self._qtized_raw_data.set_index('_tid_').to_dict('index')

# Statistics for cooccurrences.
_, self._single_stats, self._pair_stats = self.ds.get_statistics()

# Keep track of mean + std to un-normalize during prediction
self._num_attrs_mean = {}
self._num_attrs_std = {}
@@ -144,7 +154,13 @@ def __init__(self, env, dataset, domain_df,
/ (self._num_attrs_std[num_attr] or 1.)).astype(str)
self._raw_data[num_attr] = temp
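# Illustrative note: the collapsed loop above computes a per-attribute
# z-score, roughly temp = (x - mean) / (std or 1.); because the mean and std
# are memoized in _num_attrs_mean/_num_attrs_std, a prediction can later be
# mapped back to the original scale with x = x_norm * (std or 1.) + mean.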

# Indexes assigned to attributes: first categorical then numerical.
# This MUST go after the mean-0 variance 1 normalization above since
# this is looked up subsequently during training.
self._raw_data_dict = self._raw_data.set_index('_tid_').to_dict('index')

# Indexes assigned to attributes: FIRST categorical THEN numerical.
# (this order is important since we shift the numerical idxs).

self._init_attr_idxs = {attr: idx for idx, attr in enumerate(self._init_cat_attrs + self._init_num_attrs)}
self._train_attr_idxs = {attr: idx for idx, attr in enumerate(self._train_cat_attrs + self._train_num_attrs)}

@@ -155,6 +171,11 @@ def __init__(self, env, dataset, domain_df,
# Assign index for every unique value-attr (train/possible values, target)
self._train_val_idxs = {attr: {} for attr in self._train_cat_attrs}

# Initial categorical values we've seen during training. Otherwise
# we need to zero out the associated embedding since un-seen initial
# values will have garbage embeddings.
self._seen_init_cat_vals = {attr: set() for attr in self._init_cat_attrs}

# Reserve the 0th index as placeholder for padding in domain_idx and
# for NULL values.
cur_init_idx = 1
@@ -210,8 +231,6 @@ def __init__(self, env, dataset, domain_df,
self.n_init_vals = cur_init_idx
self.n_train_vals = cur_train_idx

self._raw_data_dict = self._raw_data.set_index('_tid_').to_dict('index')

self._vid_to_idx = {vid: idx for idx, vid in enumerate(domain_df['_vid_'].values)}
self._train_records = domain_df[['_vid_', '_tid_', 'attribute', 'init_value',
'init_index',
@@ -236,6 +255,8 @@ def _init_dummies(self):
dtype=torch.float)
self._dummy_domain_idxs = torch.zeros(self.max_cat_domain,
dtype=torch.long)
self._dummy_domain_cooccur = torch.zeros(self.max_cat_domain, self._n_init_attrs,
dtype=torch.float)
self._dummy_target_numvals = torch.zeros(self._max_num_dim,
dtype=torch.float)
self._dummy_cat_target = torch.LongTensor([-1])
@@ -298,6 +319,33 @@ def _get_domain_idxs(self, idx):

return self._domain_idxs[idx]

def _get_domain_cooccur_probs(self, idx):
"""
Returns co-occurrence probability for every domain value with every
initial context value (categorical and numerical (quantized)).

Returns (max_cat_domain, # of init attrs) tensor.
"""
cur = self._train_records[idx]

cooccur_probs = torch.zeros(self.max_cat_domain,
self._n_init_attrs,
dtype=torch.float)

# Compute co-occurrence statistics.
for attr_idx, attr in enumerate(self._all_attrs):
ctx_val = self._qtized_raw_data_dict[cur['_tid_']][attr]
if attr == cur['attribute'] or ctx_val == NULL_REPR or \
ctx_val not in self._pair_stats[attr][cur['attribute']]:
continue

denom = self._single_stats[attr][ctx_val]
for dom_idx, dom_val in enumerate(cur['domain']):
numer = self._pair_stats[attr][cur['attribute']][ctx_val].get(dom_val, 0.)
cooccur_probs[dom_idx,attr_idx] = numer / denom

return cooccur_probs
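# Illustrative example (numbers assumed, not from the diff): if the target
# attribute is 'city', the context attribute 'state' holds 'NY' for this
# tuple, single_stats['state']['NY'] == 100 and
# pair_stats['state']['city']['NY'] == {'New York': 80, 'Albany': 15},
# then for the domain ['New York', 'Albany', 'Buffalo'] the 'state' column
# of cooccur_probs is [0.80, 0.15, 0.0]: each entry is
# count(ctx_val, dom_val) / count(ctx_val), and unseen pairs default to 0.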

def _get_target_numvals(self, idx):
if not self.memoize or idx not in self._target_numvals:
cur = self._train_records[idx]
@@ -338,9 +386,22 @@ def _get_init_cat_idxs(self, idx):
if not self.memoize or idx not in self._init_cat_idxs:
cur = self._train_records[idx]

init_cat_idxs = torch.LongTensor([self._init_val_idxs[attr][self._raw_data_dict[cur['_tid_']][attr]]
if attr != cur['attribute'] else 0
for attr in self._init_cat_attrs])
init_cat_idxs = []
for attr in self._init_cat_attrs:
ctx_val = self._raw_data_dict[cur['_tid_']][attr]
# If the context attribute is the current target attribute
# we use the 0-vector.
# If we are in inference mode, we need to ensure we've seen
# the context value before, otherwise we assign the 0-vector.
if attr == cur['attribute'] or \
(self.inference_mode and \
ctx_val not in self._seen_init_cat_vals[attr]):
init_cat_idxs.append(0)
continue
self._seen_init_cat_vals[attr].add(ctx_val)
init_cat_idxs.append(self._init_val_idxs[attr][ctx_val])
init_cat_idxs = torch.LongTensor(init_cat_idxs)
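# Illustrative note: _seen_init_cat_vals is only populated during training;
# at inference time a context value that was never recorded maps to index 0,
# i.e. the reserved padding embedding that is kept at zero, so an unseen
# value contributes nothing to the context instead of an untrained embedding.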


if not self.memoize:
return init_cat_idxs
@@ -460,6 +521,8 @@ def __getitem__(self, vid):
# Categorical VID
if cur['attribute'] in self._train_cat_attrs:
domain_idxs, domain_mask, target = self._get_cat_domain_target(idx)
# TODO(richardwu): decide if we care about co-occurrence probabilities or not.
# domain_cooccur = self._get_domain_cooccur_probs(idx)
return vid, \
is_categorical, \
attr_idx, \
@@ -498,6 +561,9 @@ def _state_attrs(self):
return ['_vid_to_idx',
'_train_records',
'_raw_data_dict',
# '_qtized_raw_data_dict',
# '_single_stats',
# '_pair_stats',
'max_cat_domain',
'_max_num_dim',
'_init_val_idxs',
@@ -537,10 +603,11 @@ def __len__(self):
return len(self.iter)

class VidSampler(Sampler):
def __init__(self, domain_df, raw_df, numerical_attr_groups,
def __init__(self, domain_df, raw_df, num_attrs, numerical_attr_groups,
shuffle=True, train_only_clean=False):
# No NULL targets
domain_df = domain_df[domain_df['weak_label'] != NULL_REPR]
# No NULL categorical targets
domain_df = domain_df[domain_df['attribute'].isin(num_attrs) | (domain_df['weak_label'] != NULL_REPR)]


# No NULL values in each cell's numerical group (all must be non-null
# since target_numvals requires all numerical values.
@@ -557,7 +624,8 @@ def group_notnull(row):
return all(raw_data_dict[tid][attr] != NULL_REPR
for attr in attr_to_group[cur_attr])
fil_notnull = domain_df.apply(group_notnull, axis=1)
if sum(fil_notnull) < domain_df.shape[0]:

if domain_df.shape[0] and sum(fil_notnull) < domain_df.shape[0]:
logging.warning('dropping %d targets where target\'s numerical group contain NULLs',
domain_df.shape[0] - sum(fil_notnull))
domain_df = domain_df[fil_notnull]
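# Illustrative example (group names assumed): with
# numerical_attr_groups = [['lat', 'lon']], a training cell for attribute
# 'lon' is dropped whenever its tuple's 'lat' or 'lon' value is NULL_REPR,
# since target_numvals requires every value in the cell's numerical group.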
@@ -646,7 +714,8 @@ def __init__(self, env, dataset, domain_df,
fil_numattr = self.domain_df['attribute'].isin(self._numerical_attrs)

# Memoize max domain size for numerical attribue for padding later.
self.max_domain = self.domain_df['domain_size'].max()
self.max_domain = int(self.domain_df['domain_size'].max())

self.domain_df.loc[fil_numattr, 'domain'] = ''
self.domain_df.loc[fil_numattr, 'domain_size'] = 0
# Remove categorical domain/training cells without a domain
@@ -691,7 +760,8 @@ def __init__(self, env, dataset, domain_df,

self._n_init_cat_attrs = self._dataset._n_init_cat_attrs
self._n_init_num_attrs = self._dataset._n_init_num_attrs
self._n_init_attrs = self._n_init_cat_attrs + self._n_init_num_attrs

self._n_init_attrs = self._dataset._n_init_attrs

self._n_train_cat_attrs = self._dataset._n_train_cat_attrs
self._n_train_num_attrs = self._dataset._n_train_num_attrs
@@ -756,6 +826,11 @@ def __init__(self, env, dataset, domain_df,
self.attr_W = torch.nn.Parameter(torch.zeros(self._n_train_attrs,
self._n_init_cat_attrs + self._n_num_attr_groups))

# Weights for 1) embedding score and 2) co-occurrence probabilities
# for categorical domain values.
self.cat_feat_W = torch.nn.Parameter(torch.zeros(self._n_train_attrs,
1 + self._n_init_attrs, 1))

# Initialize all but the first 0th vector embedding (reserved).
torch.nn.init.xavier_uniform_(self.in_W[1:])
torch.nn.init.xavier_uniform_(self.out_W[1:])
@@ -773,6 +848,7 @@ def __init__(self, env, dataset, domain_df,
torch.nn.init.xavier_uniform_(self.out_num_bias1)

torch.nn.init.xavier_uniform_(self.attr_W)
torch.nn.init.xavier_uniform_(self.cat_feat_W)

self._cat_loss = CrossEntropyLoss()
# TODO: we use MSE loss for all numerical attributes for now.
@@ -914,24 +990,30 @@ def _get_combined_init_vec(self, init_cat_idxs, init_numvals, init_nummasks, att
def _cat_forward(self, combined_init, domain_idxs, domain_masks):
"""
combined_init: (batch, embed size, 1)
cat_attr_idxs: (batch, 1)
domain_idxs: (batch, max domain)
domain_masks: (batch, max domain)

Returns logits: (batch, max domain)
"""
# (batch, max domain, embed size)
domain_vecs = self.out_W.index_select(0, domain_idxs.view(-1)).view(*domain_idxs.shape, self._embed_size)

# (batch, max domain, 1)
logits = domain_vecs.matmul(combined_init)

embed_prods = domain_vecs.matmul(combined_init)
# (batch, max domain, 1)
domain_biases = self.out_B.index_select(0, domain_idxs.view(-1)).view(*domain_idxs.shape, 1)

# (batch, max domain, 1)
logits.add_(domain_biases)
# (batch, max domain)
logits = logits.squeeze(-1)
embed_prods.add_(domain_biases)

logits = embed_prods.squeeze(-1)

# # (batch, max domain, 1 + # of init attrs)
# domain_feats = torch.cat([embed_prods, domain_cooccur], dim=-1)

# # (batch, 1 + # of init attrs, 1)
# cat_feat_W = self.cat_feat_W.index_select(0, cat_attr_idxs.view(-1)).view(domain_feats.shape[0],
# *self.cat_feat_W.shape[1:])
# # (batch, max domain)
# logits = domain_feats.matmul(cat_feat_W).squeeze(-1)

# Add mask to void out-of-domain indexes
# (batch, max domain)
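# Illustrative note (the masking statement itself is collapsed here): the
# mask is assumed to hold 0 for in-domain positions and a large negative
# value for padding, so that something like logits.add_(domain_masks)
# drives padded positions to ~0 probability after softmax.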
@@ -992,9 +1074,6 @@ def forward(self, is_categorical, attr_idxs,
domain_idxs, domain_masks):
"""
Performs one forward pass.

is_categorical: (batch, 1)
attr_idxs: (batch, 1)
"""
# (batch, embed size, 1)
combined_init = self._get_combined_init_vec(init_cat_idxs, init_numvals,
Expand All @@ -1010,7 +1089,8 @@ def forward(self, is_categorical, attr_idxs,
domain_idxs[cat_mask], \
domain_masks[cat_mask]
# (# of cat VIDs, max_cat_domain)
cat_logits = self._cat_forward(cat_combined_init, domain_idxs, domain_masks)
cat_logits = self._cat_forward(cat_combined_init, domain_idxs,
domain_masks)

pred_numvals = torch.empty(0, self._max_num_dim)
if len(num_mask):
@@ -1064,7 +1144,7 @@ def train(self, num_epochs=10, batch_size=32, weight_entropy_lambda=0.,

# Returns VIDs to train on.
sampler = VidSampler(self.domain_df, self.ds.get_raw_data(),
self._numerical_attr_groups,
self._numerical_attrs, self._numerical_attr_groups,
shuffle=shuffle, train_only_clean=train_only_clean)

logging.debug("%s: training (lambda = %f) on %d cells (%d cells in total) in:\n1) %d categorical columns: %s\n2) %d numerical columns: %s",
@@ -1375,7 +1455,7 @@ def validate(self):
def calc_rmse(df_filter):
if df_filter.sum() == 0:
return 0
X_cor = df_res.loc[df_filter, '_value_'].apply(lambda arr: arr[0]).values.astype(np.float)
X_cor = df_res.loc[df_filter, '_value_'].apply(lambda arr: arr[0] if arr[0] != '_nan_' else 0.).values.astype(np.float)
X_inferred = df_res.loc[df_filter, 'inferred_val'].values.astype(np.float)
assert X_cor.shape == X_inferred.shape
return np.sqrt(np.mean((X_cor - X_inferred) ** 2))
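# Illustrative example: with the '_nan_' guard above, a missing ground-truth
# value is scored against 0.; e.g. X_cor = [1., 2.] and X_inferred = [1., 4.]
# gives sqrt((0.**2 + 2.**2) / 2) ~= 1.414.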