There is an endemic problem today in Bioinformatics. With Deep Learning$^{\text{TM}}$ becoming the next Hot New Thing, the Bioinformatics community has worked quickly to catch up. However, I think some important lessons from the Machine Learning community have not filtered through to the Bioinformatics community, specifically the big issue of how to choose a test set.

What purpose does a test set serve?

The fundamental purpose of a test set is a fair and honest evaluation of the performance of the trained model. In other words, the test set is used to answer the question: if we train the model and use it on future data, how well can we expect the model to perform? The honesty part is absolutely critical, because otherwise we're going to overpromise the performance. This is why the machine learning community hammers on the concept of a proper test set, and it also guides the choice of the test set. I'll illustrate this with some examples.

Examples

Suppose we want to build a model to predict stock prices. If we train the model today, then we would use it to predict prices for tomorrow. In particular, no information from tomorrow will be seen by the model during training. So if stocks A & B are correlated, say they're both in the same industry and will be similarly affected by underlying economic conditions, then we cannot use the fact that stock A is up tomorrow to predict that stock B will also be up tomorrow (except in a Granger sense, meaning we can use the current rise in A to predict a likely rise of B in the future). The same goes for an economic event that causes a general shift in stock prices: that shift would be apparent if you saw 80% of future prices, but not if you saw none of them. Therefore the proper split to simulate this behavior is a time-based split. If we don't do this, then we will fool ourselves about the performance of the algorithm, and we might deploy an algorithm that under-performs and loses us money.
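As a concrete sketch (not from any particular trading setup), a time-based split with scikit-learn might look like the following; X and y are placeholder arrays standing in for features and prices that are already sorted by date.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

# placeholder data: rows are ordered by date, which is what makes a time-based split meaningful
rng = np.random.default_rng(0)
X = rng.normal(size = (500, 10))
y = rng.normal(size = 500)

# each fold trains only on indices that come strictly before the test indices,
# so no information from the "future" leaks into training
for train_idx, test_idx in TimeSeriesSplit(n_splits = 5).split(X):
    model = Ridge().fit(X[train_idx], y[train_idx])
    print(r2_score(y[test_idx], model.predict(X[test_idx])))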

Or consider the following example from r/MachineLearning: a Nature paper proposed a deep neural network to predict the locations of aftershocks. If we train a model today and an earthquake happens tomorrow, then what information is available to predict the locations of the subsequent aftershocks? We can use past earthquakes and their aftershocks, as well as information we get from the initial earthquake. What we don't get is the aftershocks of the current earthquake. Therefore, a proper split would be either by time (as above) or by earthquakes grouped with their aftershocks. As the linked post shows, doing the latter type of split results in a simple regularized regression having better test set performance than the deep neural network, which indicates that the deep neural network is over-fitting.
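A minimal sketch of the grouped version of this split, assuming each observation is labeled with the ID of the earthquake it belongs to (the arrays below are placeholders):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold

# placeholder data: groups[i] is the ID of the earthquake that observation i belongs to
rng = np.random.default_rng(0)
X = rng.normal(size = (600, 12))
y = rng.normal(size = 600)
groups = rng.integers(0, 50, size = 600)

# all observations from a given earthquake land entirely in train or entirely in test,
# so the model never sees aftershocks of the earthquakes it is evaluated on
for train_idx, test_idx in GroupKFold(n_splits = 5).split(X, y, groups = groups):
    model = Ridge().fit(X[train_idx], y[train_idx])
    print(model.score(X[test_idx], y[test_idx]))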

Now consider examples from bioinformatics. Suppose we want to build a model to predict which guide RNAs are going to be effective. If we want to apply the model to help design guides for a new experiment, then we typically would not have access to a previous experiment in the same cell type and target phenotype (the one we are trying to select for). If we did, then we could just use that experiment to select which guides to use. In particular, if we use the same experiment both to train the model and to evaluate it, then there will be several confounders, such as batch effects, which will make us over-confident in our predictions. One paper clearly showed this with an out-of-sample test set (hidden in the supplementary material), where a simple regularized regression showed better performance than their proposed deep learning model.

Now consider the problem of predicting gene expression from other modalities, such as the promoter genetic sequence plus the open chromatin of the particular cell type. If we want to deploy the model, then we would take the model, the genetic sequence, and the open chromatin data and predict the gene expression of a sample for which we have no gene expression data, such as a new patient sample. The key here is that there is a lot of biological variation (cell type to cell type or person to person) and there are batch effects present. These effects account for a huge part of the variation, but they won't be available to the model in production. Therefore, if the model is able to "see" those batch effects (say, through a simple train-test split), we will overestimate the accuracy of the model.

Example: guide RNA design

To clearly illustrate how this issue arises we'll use the third example above. Let's say we want to build a model to predict the on-target activity of CRISPRko (CRISPR knockout) guides. To train the model we'll use the Toronto Knockout Library dataset, a collection of CRISPRko gene essentiality experiments on 5 different cell lines. To remove the bias of biological effect and the bias of using the training data to select positive hit genes, we'll subset the training data to previously known essential genes (from http://www.ncbi.nlm.nih.gov/pubmed/24987113).

First we'll have to process the counts to convert them to log fold changes. We'll do this using all guides.

Preprocessing


# this was done in R outside and is not run in this notebook
tko_loc = '/Users/tim.daley/blog/timydaley.github.io/crispr_tko/'
libs = c("DLD1", "GBM", "HCT116_1", "HeLa", "RPE1")
df_list = list()
for(l in libs){
  loc = paste0(tko_loc, "readcount-", l, "-lib1")
  x = read.table(loc, header = T)
  df_list[[l]] = x
}
for(l in libs){
  df_list[[l]]["SEQ"] = sapply(df_list[[l]]$GENE_CLONE, function(s) unlist(strsplit(s, "_"))[2])
}
design_matrices = list()
counts_list = list()
# we need custom design matrices for each experiment because the designs are not identical
# DLD1
counts_list[["DLD1"]] = df_list[["DLD1"]][c("DLD_T0", "DLD_ETOH_R1", "DLD_ETOH_R2", "DLD_ETOH_R3")]
design_matrices[["DLD1"]] = data.frame(condition = c(0, 1, 1, 1), row.names = colnames(counts_list[["DLD1"]]))
# GBM
counts_list[["GBM"]] = df_list[["GBM"]][c("T0", "T21A", "T21B")]
design_matrices[["GBM"]] = data.frame(condition = c(0, 1, 1), row.names = colnames(counts_list[["GBM"]]))
# HCT116_1
counts_list[["HCT116_1"]] = df_list[["HCT116_1"]][c("LIB1_T0", "LIB1_T18_A", "LIB1_T18_B")]
design_matrices[["HCT116_1"]] = data.frame(condition = c(0, 1, 1), row.names = colnames(counts_list[["HCT116_1"]]))
# HeLa
counts_list[["HeLa"]] = df_list[["HeLa"]][c("T0", "T18A", "T18B", "T18C")]
design_matrices[["HeLa"]] = data.frame(condition = c(0, 1, 1, 1), row.names = colnames(counts_list[["HeLa"]]))
# RPE1
counts_list[["RPE1"]] = df_list[["RPE1"]][c("T0", "T18A", "T18B")]
design_matrices[["RPE1"]] = data.frame(condition = c(0, 1, 1), row.names = colnames(counts_list[["RPE1"]]))
# now compute log2 fold changes
log2fc_list = list()
for(l in libs){
  d = DESeq2::DESeqDataSetFromMatrix(countData = counts_list[[l]],
                                     colData = design_matrices[[l]],
                                     design = ~condition)
  d = DESeq2::DESeq(d)
  d = DESeq2::results(d)
  log2fc_list[[l]] = data.frame(d, seq = df_list[[l]]$SEQ, gene = df_list[[l]]$GENE)
}
# now subset to known positive genes
#essential_genes = factor(scan(paste0(tko_loc, "ConstitutiveCoreEssentialGenes.txt"), what = character()))
essential_genes = read.table(file = paste0(tko_loc, "reference_essentials_and_nonessentials_sym_hgnc_entrez/constitutive_core_essentials_hg-Table1.tsv"), header = T)
#sum(essential_genes$Gene %in% factor(df_list[["DLD1"]]$GENE))

# what we really want is a table with log2fc, guide sequence, gene, and cell type
log2fc = data.frame()
for(l in libs){
  log2fc = rbind(log2fc, data.frame(log2fc_list[[l]][c("seq", "gene", "log2FoldChange")], lib = l))
}
log2fc['essential'] = 1*(log2fc$gene %in% essential_genes$Gene)
write.table(log2fc, file = paste0(tko_loc, "CombinedLog2FoldChanges.txt"), quote = F, sep = '\t', row.names = F)

OK, so the T's at the end of the guides appear to be missing from the data. Let's re-index to add those back in.
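A minimal sketch of this step, assuming the combined table written above is loaded into a pandas DataFrame and that each stored sequence is simply missing its final T:

import pandas as pd

# load the combined table produced by the R preprocessing step above
# (the path is shown without the directory prefix used there)
log2fc = pd.read_table("CombinedLog2FoldChanges.txt")

# append the trailing T that appears to be missing from the stored guide sequences
log2fc["seq"] = log2fc["seq"].astype(str) + "T"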

To control for variable gene effect sizes I'll include a gene indicator.
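A sketch of what the resulting feature matrix might look like, continuing from the log2fc DataFrame above: one-hot encoded guide nucleotides plus a per-gene dummy variable as the gene indicator (the exact featurization used in the notebook may differ).

import numpy as np
import pandas as pd

def one_hot_seq(seq):
    # encode a guide sequence as a flat 0/1 vector, one column per (position, nucleotide)
    lookup = {"A": 0, "C": 1, "G": 2, "T": 3}
    mat = np.zeros((len(seq), 4))
    for i, base in enumerate(seq):
        mat[i, lookup[base]] = 1
    return mat.flatten()

# drop guides whose fold change is undefined (e.g. filtered by DESeq2)
log2fc = log2fc.dropna(subset = ["log2FoldChange"]).reset_index(drop = True)

seq_features = np.vstack([one_hot_seq(s) for s in log2fc["seq"]])
gene_indicator = pd.get_dummies(log2fc["gene"]).to_numpy()  # the gene indicator
X = np.hstack([seq_features, gene_indicator])
y = log2fc["log2FoldChange"].to_numpy()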

Simple train-test split

First, let's look at a simple train-test split. Since there are 5 libraries/data sources, I'll use a 20% test set size. Note that the data are ordered by library with an equal number of guides per library, so an unshuffled 5-fold CV split would coincide with a split by library. To avoid that here, I'll shuffle the data frame before computing the CV scores.
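A minimal sketch of this split, using the X and y built in the sketch above (the number of trees and random seeds are placeholders):

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# shuffling breaks the library ordering, so each 20% test fold mixes all 5 libraries
shuffled_cv = KFold(n_splits = 5, shuffle = True, random_state = 0)
rf = RandomForestRegressor(n_estimators = 100, random_state = 0)
scores = cross_val_score(rf, X, y, cv = shuffled_cv)  # default scoring is R^2
print(scores, scores.mean())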

Split by library

Now let's take a look at what happens when you split by library. Note that since the libraries are in order and there are an equal number of guides per library, a standard (unshuffled) 5-fold cross validation splits exactly by library.
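A minimal sketch of the by-library split; passing the library labels as groups makes the leave-one-library-out structure explicit and is equivalent to the unshuffled 5-fold split described above (again using the X, y, and log2fc from the earlier sketches):

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

# each fold holds out one entire library, so batch effects in the held-out library are unseen at training time
rf = RandomForestRegressor(n_estimators = 100, random_state = 0)
scores = cross_val_score(rf, X, y, groups = log2fc["lib"], cv = LeaveOneGroupOut())
print(scores, scores.mean())  # one score per held-out library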

Interpretation

Note that the average $R^{2}$ (the default score for RandomForestRegressor) is lower when we split by library. In addition, the variance across folds is higher: some libraries are predicted very well (e.g. DLD1 and GBM) and some are predicted very poorly (HeLa). The latter was noted in a previous project I was involved in with Sunil Bodapati.

The order of the cell types is as follows: DLD1, GBM, HCT116_1, HeLa, RPE1.

Note that the first two have the highest test set scores. It seems reasonable that DLD1 and HCT116 would be highly predictive of each other, since they are similar cell types. And it is reasonable that HeLa is very difficult to predict, since the karyotype of HeLa is completely haywire. I really have no hypotheses about the good test score of GBM. Critically, what we're missing is the metadata, such as the specific experimental design and who prepared the libraries. In my experience, such details are crucial to evaluating the quality of a sequencing-based experiment. When researchers outside the organization use a publicly available ML tool, the batch effects will be new (to the ML tool).