-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmyTextPreprocessing.py
More file actions
98 lines (95 loc) · 4.63 KB
/
myTextPreprocessing.py
File metadata and controls
98 lines (95 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
import string
from bs4 import BeautifulSoup
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
class MyTextPreproccesor:
    """Text-cleaning utilities for tweet-like input.

    Two entry points:
      * ``clean``    -- strip HTML, @mentions, URLs, punctuation, hashtags and
                        non-ASCII characters, then lowercase and re-tokenize.
      * ``my_clean`` -- lowercase/length-filter, optional stopword removal and
                        stemming+lemmatization, plus a regex normalization pass
                        (contraction expansion, symbol spacing, etc.).

    NOTE: class name keeps the original (misspelled) identifier so existing
    callers are unaffected.
    """

    @staticmethod
    def _filter_tokens(tokens, stops, stemming):
        """Apply optional stopword removal and stem+lemmatize to a token list.

        Mirrors the original branch logic: when both flags are set, stopwords
        are removed both before and after stemming (stems of kept words can
        themselves be stopwords).
        """
        if stops:
            # Hoist the stopword list into a set once: membership tests drop
            # from O(len(list)) per token to O(1).
            stop_words = set(stopwords.words('english'))
            tokens = [w for w in tokens if w not in stop_words]
        if stemming:
            # BUG FIX: original had ``WordNetLemmatizer()()`` (an extra call
            # on the instance) in the stemming-only branch, which raised
            # TypeError at runtime.
            lemmatizer = WordNetLemmatizer()
            stemmer = SnowballStemmer("english", ignore_stopwords=True)
            # Stem first, then lemmatize the stems (original order).
            tokens = [stemmer.stem(w) for w in tokens]
            tokens = [lemmatizer.lemmatize(w) for w in tokens]
            if stops:
                tokens = [w for w in tokens if w not in stop_words]
        return tokens

    @staticmethod
    def clean(text):
        """Return *text* cleaned of tweet noise, lowercased and re-joined.

        Pipeline: HTML-decode via BeautifulSoup, then remove @mentions, URLs,
        punctuation, hashtag remnants and non-ASCII bytes; finally lowercase,
        tokenize with WordPunctTokenizer and join with single spaces.
        """
        tokenizer = WordPunctTokenizer()
        mention_pat = r'@[\w\-]+'                      # @username
        url_pat = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                   r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        hashtag_pat = r'#[\w\-]+'
        punct_pat = '[' + string.punctuation + ']'     # any ASCII punctuation
        non_ascii_pat = r'[^\x00-\x7f]'

        # Decode HTML entities (e.g. "&amp;") and drop markup.
        stripped = BeautifulSoup(text, 'html.parser').get_text()
        stripped = re.sub(mention_pat, '', stripped)   # remove @mentions
        stripped = re.sub(url_pat, '', stripped)       # remove URLs
        # (original also ran re.sub('') here -- a no-op, removed)
        stripped = re.sub(punct_pat, '', stripped)     # remove punctuation
        # NOTE: punctuation removal above already deleted '#', so this keeps
        # the hashtag words intact -- preserved from the original order.
        stripped = re.sub(hashtag_pat, '', stripped)
        stripped = re.sub(non_ascii_pat, '', stripped) # remove non-ASCII
        words = tokenizer.tokenize(stripped.lower())
        return (" ".join(words)).strip()

    @staticmethod
    def my_clean(text, stops=False, stemming=False, minLength=2):
        """Normalize *text* and return it as a single cleaned string.

        Args:
            text: raw input string.
            stops: if True, drop English stopwords.
            stemming: if True, Snowball-stem then WordNet-lemmatize tokens.
            minLength: minimum token length to keep (default 2).

        The stop/stem pass runs twice -- once on the raw tokens and once
        after the regex normalization -- matching the original behavior.
        """
        tokens = [w for w in text.lower().split() if len(w) >= minLength]
        tokens = MyTextPreproccesor._filter_tokens(tokens, stops, stemming)
        text = " ".join(tokens)

        # Regex normalization chain (patterns preserved verbatim).
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # "5k" -> "5000"
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        # NOTE(review): r"\0s" matches a NUL byte followed by 's' -- probably
        # intended as "0s", but kept verbatim to preserve behavior.
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)            # collapse whitespace

        # Second pass over the normalized text.
        tokens = [w for w in text.lower().split() if len(w) >= minLength]
        tokens = MyTextPreproccesor._filter_tokens(tokens, stops, stemming)
        return " ".join(tokens)