Skip to content

Commit 45c42ae

Browse files
Add files via upload
1 parent 192cd41 commit 45c42ae

File tree

2 files changed, with 10 additions and 6 deletions.

2 files changed, with 10 additions and 6 deletions.

relation_features.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from nltk.translate import bleu
1111
from nltk.translate.bleu_score import SmoothingFunction
1212
from sentence_transformers import SentenceTransformer, util
13+
import re
1314

1415
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
1516
smoothie = SmoothingFunction().method4
@@ -22,10 +23,8 @@ def transformer_similarity(text1, text2):
2223
"""
2324
Use sentence transformer to calculate similarity between two sentences.
2425
"""
25-
text1 = text1.split("_")
26-
text2 = text2.split("_")
27-
text1 = [t.lower() for t in text1]
28-
text2 = [t.lower() for t in text2]
26+
text1,text2 = text1.lower(), text2.lower()
27+
text1 = re.split(r'[\s\-\_\.]', text1)
2928
text1 = " ".join(text1).strip()
3029
text2 = " ".join(text2).strip()
3130
embeddings1 = model.encode(text1)

self_features.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import numpy as np
33
import re
44
from dateutil.parser import parse as parse_date
5+
import random
56

67
unit_dict = {"万": 10000, "亿": 100000000, "萬": 10000, "億": 100000000, "K+": 1000, "M+": 1000000, "B+": 1000000000}
78

@@ -50,6 +51,10 @@ def extract_numeric(data_list):
5051
"""
5152
Extracts numeric part(including float) from string list
5253
"""
54+
try:
55+
data_list = [float(d) for d in data_list]
56+
except:
57+
pass
5358
numeric_part = []
5459
unit = []
5560
for data in data_list:
@@ -152,7 +157,7 @@ def extract_features(data_list):
152157
"""
153158
Extract some features from the given data(column) or list
154159
"""
155-
data_list = [d for d in data_list if d is not np.NaN and d != "--"]
160+
data_list = [d for d in data_list if d == d and d != "--"]
156161
if len(data_list) == 0:
157162
return 0
158163
data_types = ("url","numeric","date","string")
@@ -204,5 +209,5 @@ def make_self_features_from(filepath):
204209
return features
205210

206211
if __name__ == '__main__':
207-
features = make_self_features_from("Training Data/pair_3/Table2.csv")
212+
features = make_self_features_from("Training Data/pair_7/Table1.csv")
208213
print(features)

0 commit comments

Comments (0)