-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_data.py
More file actions
63 lines (54 loc) · 1.86 KB
/
filter_data.py
File metadata and controls
63 lines (54 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
from collections import Counter
# Input file: each line is one session, tokens separated by single spaces.
inter_file_path = './uk_session_train.txt'
# Thresholds used as defaults by drop_session / drop_item below.
session_min_inter = 2
item_min_inter = 5
# Output path derived from the two thresholds.
# NOTE(review): nothing in the visible code writes to this path -- confirm
# whether the filtered sessions are supposed to be saved here.
filter_file_path = './uk_session_filter_session_{}_item_{}.txt'.format(
    session_min_inter, item_min_inter)

# Load every line as a list of tokens (a blank line yields ['']).
session_list = []
with open(inter_file_path, 'r') as f:
    for session in f:
        session_list.append(session.strip().split(' '))

# len() returns an int; it must be converted before string concatenation,
# otherwise this line raises TypeError.
print('all session num ' + str(len(session_list)))
def drop_item(sessions, limit=item_min_inter):
    """Count item occurrences and keep only the sufficiently frequent ones.

    Args:
        sessions: iterable of sessions, each an iterable of hashable items.
        limit: minimum total number of occurrences (across all sessions)
            an item needs in order to be kept.

    Returns:
        A plain dict mapping each kept item to its occurrence count, in
        order of first appearance.
    """
    # Counter replaces the manual "if key in dict: += 1 else: = 1" tally.
    counts = Counter(item for session in sessions for item in session)
    return {item: freq for item, freq in counts.items() if freq >= limit}
def drop_session(sessions, aaa, limit=session_min_inter):
    """Strip unknown items from each session and discard short sessions.

    Args:
        sessions: iterable of sessions, each an iterable of items.
        aaa: mapping whose keys are the items allowed to remain.
        limit: minimum number of remaining items for a session to be kept.

    Returns:
        A new list of sessions (each a new list) containing only allowed
        items, with sessions shorter than ``limit`` removed.
    """
    allowed = set(aaa.keys())
    kept_sessions = []
    for session in sessions:
        trimmed = [item for item in session if item in allowed]
        if len(trimmed) < limit:
            continue
        kept_sessions.append(trimmed)
    return kept_sessions
# sessions = [['1','2','3'],['2','4'],['1','2','3'],['1','2','3'],['1','2','3']]
# aaa = {'1':10,'2':3,'3':6,'4':1}
if __name__ == '__main__':
    # Alternate item filtering and session filtering until a full pass
    # changes nothing: dropping sessions lowers item counts, which can push
    # further items below the threshold, and vice versa.
    while True:
        previous = session_list
        kept_items = drop_item(session_list)
        session_list = drop_session(session_list, kept_items)
        if session_list == previous:
            break

    print('all seesion num (after filtering): ' + str(len(session_list)))
    # default=0 avoids ValueError from min() when every session was removed.
    print('min seesion len: ' + str(min(map(len, session_list), default=0)))

    # Recount item frequencies over the surviving sessions for the report.
    final_item = Counter(
        item for session in session_list for item in session)
    print('all item num: ' + str(len(final_item)))
    print('min item freq:' + str(min(final_item.values(), default=0)))