import numpy as np
import pandas as pd
import datetime
import re
from scipy.stats import pearsonr
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import timeit
import pickle
import scipy
####################################################################################################################################
############# CLEAN THE DATASET TO EXTRACT TRAJECTORIES ############################################################################
####################################################################################################################################
def data_processing():
    print("Loading the raw data file... ")
    # IMPORTING THE RAW DATA OF THE SUBOPTIMAL POLICY
    df = pd.read_csv('raw_data.txt', sep='\t', engine='python', nrows=1000, encoding='utf-16')
    print("/*************/")
    print("File loaded... ")

    # FUNCTIONS USED TO CLEAN THE ABOVE RAW DATA
    def read_data(file, enc='utf16', delim='\t', decim='.'):
        with open(file, newline='', encoding=enc) as datafile:
            return pd.read_csv(datafile, skiprows=0, header=0, delimiter=delim, decimal=decim, low_memory=False)

    def load_pkl(file):
        with open(file, 'rb') as inp:
            return pickle.load(inp)

    def save_pkl(var, file):
        with open(file, 'wb') as output:
            pickle.dump(var, output, pickle.HIGHEST_PROTOCOL)

    def running_mean(x, n):
        # Sliding-window mean via the cumulative-sum trick: the sum of any
        # length-n window is the difference of two cumulative sums.
        cumsum = np.cumsum(np.insert(x, 0, 0))
        return (cumsum[n:] - cumsum[:-n]) / float(n)
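    # For example (a quick sanity check, not in the original file):
    #   running_mean([1, 2, 3, 4], 2) -> array([1.5, 2.5, 3.5])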
    def running_min(x, n):
        min_list = pd.DataFrame(x).rolling(n).min()
        return min_list.loc[(n - 1):]

    def running_max(x, n):
        max_list = pd.DataFrame(x).rolling(n).max()
        return max_list.loc[(n - 1):]

    def moving_avg_no_pad(pointList, winWidth):
        # Moving average whose window shrinks near the edges instead of padding.
        moving_aves = []
        # cumsum = [0] + list(accumulate(pointList))
        cumsum = np.cumsum(np.insert(pointList, 0, 0))
        for i, x in enumerate(pointList, 1):
            if winWidth % 2 == 0:
                if i <= winWidth // 2:
                    moving_ave = (cumsum[i + winWidth // 2 - 1] - cumsum[0]) / (i + winWidth // 2 - 1)
                elif i > len(pointList) - winWidth // 2 + 1:
                    moving_ave = (cumsum[len(pointList)] - cumsum[i - winWidth // 2 - 1]) / (
                        len(pointList) - i + winWidth // 2 + 1)
                else:
                    moving_ave = (cumsum[i + winWidth // 2 - 1] - cumsum[i - winWidth // 2 - 1]) / winWidth
            else:
                if i <= winWidth // 2:
                    moving_ave = (cumsum[i + winWidth // 2] - cumsum[0]) / (i + winWidth // 2)
                elif i > len(pointList) - winWidth // 2:
                    moving_ave = (cumsum[len(pointList)] - cumsum[i - winWidth // 2 - 1]) / (
                        len(pointList) - i + winWidth // 2 + 1)
                else:
                    moving_ave = (cumsum[i + winWidth // 2] - cumsum[i - winWidth // 2 - 1]) / winWidth
            moving_aves.append(moving_ave)
        return moving_aves
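    # For example (a quick sanity check, not in the original file): with an
    # odd window the edges average over a shrunken window rather than padding:
    #   moving_avg_no_pad([1, 2, 3, 4, 5], 3) -> [1.5, 2.0, 3.0, 4.0, 4.5]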
    def range_sign(x, pos_thres=0):
        # Three-way sign with a dead zone of width 2 * pos_thres around zero.
        if x > pos_thres:
            return 1
        elif x < -pos_thres:
            return -1
        else:
            return 0
    def data_statistics_preprocess(df, target_column='Inc Electrical Antenna Tilt (deg)',
                                   drop_columns=[], pick_columns=[], split=0.7, reward_limit=20):
        df = df.drop(columns=['CGI'])
        # df = df.dropna(how='any')
        # Theoretical range of the tilt angle is taken as 0-25 deg
        df['Electrical Antenna Tilt (deg)'] = df['Electrical Antenna Tilt (deg)'].apply(
            lambda x: float(x) / 25)
        # Theoretical range of throughput is taken as 0-4,300,000 Kbps
        df['Average UE PDCP DL Throughput (Kbps)'] = df['Average UE PDCP DL Throughput (Kbps)'].apply(
            lambda x: float(x) / 4300000)
        # Theoretical range of the number of calls is taken as 0-3,200,000
        df['Num Calls'] = df['Num Calls'].apply(
            lambda x: float(x) / 3200000)
        # Theoretical range of the overshooting factor is taken as -1 to 499
        df['Time Advance Overshooting Factor'] = df['Time Advance Overshooting Factor'].apply(
            lambda x: float(x) / 500)
        # Theoretical range of the inter-site distance is taken as 0-100 Km
        df['Inter Site Distance (Km)'] = df['Inter Site Distance (Km)'].apply(
            lambda x: float(x) / 100)
        df['Var Electrical Antenna Tilt (deg)'] = df['Var Electrical Antenna Tilt (deg)']
        # Tilt increments are reduced to their sign (-1, 0 or 1)
        df['Inc Electrical Antenna Tilt (deg)'] = df['Inc Electrical Antenna Tilt (deg)'].apply(
            lambda x: np.sign(x))
        # Percentages normalised by 100
        df['Low RSRP Samples Rate Edge (%)'] = df['Low RSRP Samples Rate Edge (%)'].apply(
            lambda x: float(x) / 100)
        df['Number of Times Interf (%)'] = df['Number of Times Interf (%)'].apply(
            lambda x: float(x) / 100)
        df['Number of Cells High Overlap High Rsrp Src Agg (%)'] = df['Number of Cells High Overlap High Rsrp Src Agg (%)'] \
            .apply(lambda x: float(x) / 100)
        df['Number of Cells High Overlap High Rsrp Tgt Agg (%)'] = df['Number of Cells High Overlap High Rsrp Tgt Agg (%)'] \
            .apply(lambda x: float(x) / 100)
        df['PDCCH CCE High Load (%)'] = df['PDCCH CCE High Load (%)'].apply(lambda x: float(x) / 100)
        df.reset_index(drop=True, inplace=True)
        return df
    def split_df(df, inputs=None, outputs=None):
        if outputs is None:
            raise Exception('Output cannot be None when splitting')
        if inputs is None:
            return df.drop(columns=outputs), df[outputs]
        return df[inputs], df[outputs]
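    # Illustrative call (this example is not in the original file):
    #   X, y = split_df(clean_data, outputs=['Reward_Weighted_Delta_Driver_HighLevel_KPIs'])
    # returns every remaining column as features and the reward column as target.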
    # OBTAIN THE CLEAN VERSION OF THE RAW DATA
    clean_data = data_statistics_preprocess(df)
    print("columns:", clean_data.columns)
    cols = ["Electrical Antenna Tilt (deg)",
            "Average UE PDCP DL Throughput (Kbps)",
            "Num Calls",
            "Low RSRP Samples Rate Edge (%)",
            "FUZZY_LOW_RSRP_SAMPLES_EDGE_HIGH",
            "Number of Times Interf (%)",
            "FUZZY_NUM_TIMES_INTERF_HIGH",
            "Time Advance Overshooting Factor",
            "FUZZY_OSF_HIGH",
            "Number of Cells High Overlap High Rsrp Src Agg (%)",
            "FUZZY_NUM_CELLS_HIGH_OVERLAP_SRC_HIGH",
            "Number of Cells High Overlap High Rsrp Tgt Agg (%)",
            "FUZZY_NUM_TIMES_HIGH_OVERLAP_TGT_HIGH",
            "Inter Site Distance (Km)",
            "PDCCH CCE High Load (%)",
            "Inc Electrical Antenna Tilt (deg)",
            "Var Electrical Antenna Tilt (deg)",
            "Reward_Weighted_Delta_Driver_HighLevel_KPIs"]
    clean_data = clean_data[cols]
    clean_data = clean_data.dropna(how='any')
    print("Shape of clean data:", clean_data.shape)
    # Create the state space: the state is the vector of normalised KPI
    # columns 1-14, the action is the tilt-increment sign and the reward is
    # the weighted-delta KPI column.
    def create_state(df):
        state_list = []
        action_list = []
        for index, rows in df.iterrows():
            state_list.append(rows.iloc[1:15].tolist())
            # Column 15 is 'Inc Electrical Antenna Tilt (deg)', the action.
            action_list.append(rows.iloc[15])
        df['state'] = state_list
        df['action'] = action_list
        rl_df = df[["state", "action", "Reward_Weighted_Delta_Driver_HighLevel_KPIs"]]
        rl_df = rl_df.rename(columns={"Reward_Weighted_Delta_Driver_HighLevel_KPIs": "reward"})
        return rl_df
    newdf = create_state(clean_data)
    newdf = newdf.reset_index(drop=True)
    print("Shape of new df:", newdf.shape)
    return newdf
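
# Minimal usage sketch; the output filename 'trajectories.pkl' is
# illustrative and not part of the original module:
if __name__ == '__main__':
    trajectories = data_processing()
    print(trajectories.head())
    with open('trajectories.pkl', 'wb') as out:
        pickle.dump(trajectories, out, pickle.HIGHEST_PROTOCOL)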