-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathregex_utils.py
More file actions
89 lines (65 loc) · 2.11 KB
/
regex_utils.py
File metadata and controls
89 lines (65 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
# Noise removed by clean_tweet. Alternatives are tried left to right at each
# position, so links and @mentions are consumed whole before the catch-all
# punctuation class can break them into pieces.
_TWEET_NOISE_RE = re.compile(
    r"\w+://\S+"          # links such as http://t.co/abc
    r"|@[A-Za-z0-9_]+"    # @mentions (handles may contain underscores)
    r"|\bRT\b|\brt\b"     # retweet marker, only as a standalone word
    r"|[^0-9A-Za-z \t]"   # any remaining non-alphanumeric character
)


def clean_tweet(tweet):
    '''
    Clean tweet text by removing links, @mentions, the retweet marker and
    special characters, then collapsing whitespace.

    Each removed span is replaced by a space and the result is re-joined on
    single spaces, so the returned string has no leading, trailing or
    repeated whitespace.

    The retweet marker ("RT" or "rt") is removed only when it stands alone as
    a word; letters inside words such as "party" or "start" are kept.

    :param tweet: raw tweet text (str).
    :return: cleaned text containing only ASCII letters, digits and single
        spaces.
    '''
    return ' '.join(_TWEET_NOISE_RE.sub(' ', tweet).split())
# Pattern used to locate http(s) URLs inside free text.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Trailing residue removed in "strip" mode: literal escape text such as
# \xa0, \xc2, \xe2 or \u2069 left over from badly decoded input, and stray
# quotes / brackets / punctuation that the permissive URL pattern picks up.
_URL_TRAILING_JUNK_RE = re.compile(
    r"(?:"
    r"\\(?:xa0|xc[0-9a-fA-F]?|xe2|u206[0-9a-fA-F]?)"
    r"|[\]\[\\^$|().'\"]"
    r")+$"
)


def get_url(url, **kwargs):
    '''
    Extract the first http(s) URL found in a piece of text.

    :param url: text to search (str).
    :param kwargs: if the key ``strip`` is present (whatever its value),
        trailing junk is removed from the URL: literal backslash-escape
        residue and stray quotes, brackets, parentheses and periods.
    :return:
        * without ``strip``: the first URL, or None when none is found;
        * with ``strip``: the cleaned first URL, or an empty string when
          none is found;
        * None when ``url`` is not something the regex can search (for
          example None or a number).
    '''
    try:
        matches = _URL_RE.findall(url)
    except TypeError as e:
        # Best effort: report non-text input and carry on instead of raising.
        print('Error in:' + str(url))
        print(e)
        return None

    if 'strip' in kwargs:
        if not matches:
            return ''
        # Remove junk as whole sequences. str.strip() would treat its
        # argument as a set of characters and eat legitimate trailing
        # letters and digits of the URL.
        return _URL_TRAILING_JUNK_RE.sub('', matches[0])

    if not matches:
        # Report the original input rather than the empty match list.
        print('Error in:' + str(url))
        print('no URL found')
        return None
    return matches[0]
def find_domain(url):
    '''
    Return the host part of a URL.

    Everything up to and including the first "//" is dropped (when present),
    then the result is cut at the first "/", "?" or "#". Using the first
    "//" rather than the last keeps the real host when another URL is
    embedded in the path or query string.

    :param url: URL to inspect; it is converted with str() first, so
        non-string values are accepted.
    :return: the host as a str (may still include a port or user info,
        which are not removed).
    '''
    text = str(url)
    _, separator, after_scheme = text.partition('//')
    # No "//" at all: treat the whole text as starting at the host.
    remainder = after_scheme if separator else text
    return re.split(r'[/?#]', remainder, maxsplit=1)[0]
def find_subdomain(url):
    '''
    Reduce a URL to its host plus the first path segment.

    Despite the name, the value returned is "host/first-segment" (or just
    the host when there is no path), with the scheme, spaces, the query
    string and any literal ".php" removed.

    :param url: URL to reduce (str).
    :return: the reduced URL as a str.
    '''
    # Remove http:// and https://
    url = re.sub(r'https?://', '', url)
    # Remove empty spaces
    url = url.replace(' ', '')
    # Drop the query string
    url = url.split('?', 1)[0]
    # Remove the literal ".php" extension. A plain replace is used because
    # an unescaped regex dot would also eat any character followed by "php".
    url = url.replace('.php', '')
    # Keep at most the first two "/"-separated parts: host and first folder.
    return '/'.join(url.split('/', 2)[:2])
def check_domain(url, domain):
    '''
    Tell whether `domain` is one of the dot-separated labels of the URL's
    host, as extracted by find_domain.

    The comparison is against whole labels, so a `domain` value that itself
    contains a dot never matches.

    :param url: URL whose host is inspected.
    :param domain: single label to look for (str).
    :return: True when the label is present, False otherwise.
    '''
    host_labels = find_domain(url).split('.')
    return domain in host_labels
def set_query_from_url(url):
    '''
    Turn a URL into a space-separated search query.

    The http(s) scheme is removed and the remainder is split on "/", "-"
    and ".". Empty pieces produced by trailing or repeated separators are
    dropped so the query has no stray spaces.

    :param url: URL to convert (str).
    :return: the words of the URL joined by single spaces.
    '''
    url = re.sub(r'https?://', '', url)
    tokens = re.split(r'[/.-]', url)
    return ' '.join(token for token in tokens if token)
def strip_urls(text):
    '''
    Remove URL-like tokens from text.

    Any run of non-whitespace characters with a dot inside it is deleted,
    which covers URLs with and without http(s) or www. Note that this also
    removes other dotted tokens such as decimal numbers. Tokens are bounded
    by any whitespace, so a line break or tab before a URL does not cause
    the preceding word to be removed with it.

    :param text: text to clean.
    :return: the cleaned str, or None when `text` is not a str.
    '''
    if not isinstance(text, str):
        # Non-text values (None, numbers, ...) are passed over silently.
        return None
    return re.sub(r'\S+\.\S+', '', text)
# First <a ...>text</a> element: the word boundary keeps tags such as <abbr>
# from matching, and the lazy group stops at the first closing tag.
_ANCHOR_TEXT_RE = re.compile(r'<a\b[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)


def get_link_title(link):
    '''
    Return the inner text of the first HTML anchor element in `link`.

    Matching is case-insensitive and may span line breaks. Any markup
    nested inside the anchor is returned as-is.

    :param link: HTML fragment (str); falsy values are accepted.
    :return: the anchor's inner text, or None when `link` is falsy or
        contains no anchor element.
    '''
    if not link:
        return None
    match = _ANCHOR_TEXT_RE.search(link)
    return match.group(1) if match else None