"""Classes and functions for working with RNNs.
"""
import parse_data
import numpy as np
import tensorflow as tf
__author__ = 'Hayden Metsky <[email protected]>'

class LSTM:
    """Unidirectional and bidirectional LSTMs, which have been applied to
    protein sequences.

    This can optionally also use an embedding, but that does not make much
    sense here: the vocabulary size is already very small.

    TODO: multiplicative LSTMs.
    """

    def __init__(self, context_nt, units=64, bidirectional=False,
            embed_dim=None, dropout_rate=0.5, regression=True,
            class_weight=None, batch_size=32):
        """
        Args:
            context_nt: amount of context to use in the target
            units: dimensionality of the LSTM output vector (and of the
                cell state vector)
            bidirectional: if True, use a bidirectional LSTM
            embed_dim: if set, embed sequences with an embedding layer and
                use this as its dimensionality; otherwise, use the one-hot
                encoded sequence as input
            dropout_rate: dropout rate before the final layer
            regression: if True, perform regression; else, classification
            class_weight: class weight for training; only applicable for
                classification
            batch_size: batch size
        """
        self.context_nt = context_nt
        self.units = units
        self.bidirectional = bidirectional
        self.embed_dim = embed_dim
        self.dropout_rate = dropout_rate
        self.regression = regression
        self.class_weight = class_weight
        self.batch_size = batch_size

    # get_params() and set_params() are needed if we wish to use this
    # class as a scikit-learn estimator
    # (see https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator)
    def get_params(self, deep=True):
        return {'context_nt': self.context_nt,
                'units': self.units,
                'bidirectional': self.bidirectional,
                'embed_dim': self.embed_dim,
                'dropout_rate': self.dropout_rate,
                'regression': self.regression,
                'class_weight': self.class_weight,
                'batch_size': self.batch_size}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
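
    # For example (a sketch, not part of the original code; assumes
    # scikit-learn is installed and that lstm_instance is an existing
    # instance of this class), the two methods above make the estimator
    # compatible with scikit-learn utilities such as cloning:
    #   from sklearn.base import clone
    #   lstm_copy = clone(lstm_instance)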

    def setup(self, seq_len):
        """Set up the model.

        Args:
            seq_len: length of each sequence; only used to specify the
                input shape to the first layer
        """
        final_activation = 'linear' if self.regression else 'sigmoid'

        self.model = tf.keras.Sequential()

        if self.embed_dim is not None:
            # The vocabulary size is 5 (A, C, G, T, and no corresponding
            # guide nt); the input shape is (seq_len, 2) -- seq_len nt
            # long, with a value at each position for the target and for
            # the guide
            self.model.add(tf.keras.layers.Embedding(
                input_dim=5, output_dim=self.embed_dim,
                input_shape=(seq_len, 2)))

            # Merge the last two dimensions so that there is a single 1D
            # vector at each position of the guide-target (rather than a
            # vector [[embedded vector for target], [embedded vector for
            # guide]] at each position); i.e., change the shape from
            # [batch, length, 2, embed_dim] to [batch, length, 2*embed_dim].
            # Note that, with tf.keras, we ignore the batch dimension when
            # reshaping
            self.model.add(tf.keras.layers.Reshape(
                (seq_len, 2*self.embed_dim)))

            # The input has seq_len timesteps, and each timestep has
            # 2*self.embed_dim features
            lstm_input_shape = (seq_len, 2*self.embed_dim)
        else:
            # The input has seq_len timesteps, and each timestep has 8
            # features (4 for the target and 4 for the guide)
            lstm_input_shape = (seq_len, 8)

        lstm = tf.keras.layers.LSTM(self.units, input_shape=lstm_input_shape)
        if self.bidirectional:
            self.model.add(tf.keras.layers.Bidirectional(lstm))
        else:
            self.model.add(lstm)
        self.model.add(tf.keras.layers.Dropout(self.dropout_rate))
        self.model.add(tf.keras.layers.Dense(1, activation=final_activation))

        if self.regression:
            self.model.compile('adam', 'mse', metrics=[
                tf.keras.metrics.MeanSquaredError(),
                tf.keras.metrics.MeanAbsoluteError()])
        else:
            # Use BinaryAccuracy, which thresholds the sigmoid output at
            # 0.5, rather than Accuracy, which would test for an exact
            # match between the predicted probability and the 0/1 label
            self.model.compile('adam', 'binary_crossentropy', metrics=[
                tf.keras.metrics.AUC(),
                tf.keras.metrics.BinaryAccuracy()])

    def fit(self, x_train, y_train, max_num_epochs=1000):
        """Fit the model.

        Args:
            x_train/y_train: training data
            max_num_epochs: maximum number of epochs to run (early stopping
                should stop training before this)
        """
        # Set up the model; do this again in case parameters changed
        seq_len = len(x_train[0])
        self.setup(seq_len)

        if self.embed_dim is not None:
            x_train = np.array([parse_data.input_vec_for_embedding(x,
                self.context_nt) for x in x_train])

        # Set up early stopping; the validation data is only used for
        # early stopping
        # Note that this uses a random train/val split to decide when to
        # stop early; this may not be ideal due to crRNA overlap between
        # the train/val sets (it will likely stop too late and overfit)
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                mode='min', patience=2)

        self.model.fit(x_train, y_train, validation_split=0.25,
                batch_size=self.batch_size,
                callbacks=[es], class_weight=self.class_weight,
                epochs=max_num_epochs,
                verbose=2)

    def predict(self, x_test):
        """Make predictions.

        Args:
            x_test: input data for predictions

        Returns:
            predictions
        """
        if self.embed_dim is not None:
            x_test = np.array([parse_data.input_vec_for_embedding(x,
                self.context_nt) for x in x_test])
        # The final Dense layer outputs shape (N, 1); flatten to a 1D array
        return self.model.predict(x_test).ravel()
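
# A minimal usage sketch (not part of the original module): train on random
# synthetic data with embed_dim=None, which exercises the one-hot
# (seq_len, 8) input path and avoids needing parse_data. All shapes, sizes,
# and hyperparameter values below are illustrative assumptions.
if __name__ == '__main__':
    seq_len = 28
    num_seqs = 256

    # Random stand-in for one-hot encoded guide-target pairs: at each of
    # seq_len positions, 4 values for the target and 4 for the guide
    x = np.random.randint(0, 2, size=(num_seqs, seq_len, 8)).astype('float32')
    # Random regression targets in [0, 1)
    y = np.random.random(size=num_seqs).astype('float32')

    lstm = LSTM(context_nt=10, units=16, regression=True)
    lstm.fit(x, y, max_num_epochs=5)
    preds = lstm.predict(x)
    print(preds[:5])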