From cf281af200b745f7061421c9e2caee40de81aaf8 Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 18:25:22 +0000 Subject: [PATCH 01/12] Add generic checker Regex. Remove 13 duplicates --- src/Fixtures/Crawlers.php | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index f419df99..75260315 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -51,7 +51,6 @@ class Crawlers extends AbstractProvider '13TABS', '192\.comAgent', '2ip\.ru', - '404checker', '404enemy', '7Siters', '80legs', @@ -188,7 +187,6 @@ class Crawlers extends AbstractProvider 'CakePHP', 'Calculon', 'Canary%20Mail', - 'CapsuleChecker', 'CaretNail', 'catexplorador', 'CC Metadata Scaper', @@ -224,7 +222,6 @@ class Crawlers extends AbstractProvider 'colly -', 'CommaFeed', 'Commons-HttpClient', - 'Comodo SSL Checker', 'contactbigdatafr', 'contentkingapp', 'convera', @@ -338,7 +335,6 @@ class Crawlers extends AbstractProvider 'FeedBucket', 'FeedBunch\/[0-9]', 'FeedBurner', - 'FeedChecker', 'Feedly', 'Feedreader', 'FeedshowOnline', @@ -366,11 +362,9 @@ class Crawlers extends AbstractProvider 'http:\/\/www.neomo.de\/', //'Francis [Bot]' 'free thumbnails', 'Freeuploader', - 'FreeWebMonitoring SiteChecker', 'Funnelback', 'G-i-g-a-b-o-t', 'g00g1e\.net', - 'GAChecker', 'ganarvisitas\/[0-9]', 'geek-tools', 'Genderanalyzer', @@ -429,7 +423,6 @@ class Crawlers extends AbstractProvider 'Gookey', 'GoScraper', 'GoSpotCheck', - 'GoSquared-Status-Checker', 'gosquared-thumbnailer', 'Gotit', 'GoZilla', @@ -572,7 +565,6 @@ class Crawlers extends AbstractProvider 'JS-Kit', 'JustView', 'Kaspersky Lab CFR link resolver', - 'KeepRight OpenStreetMap Checker', 'Kelny\/', 'Kerrigan\/', 'KeyCDN', @@ -604,7 +596,6 @@ class Crawlers extends AbstractProvider 'Liferea\/', 'Lightspeedsystems', 'Likse', - 'link checker', 'Link Valet', 'link_thumbnailer', 'LinkAlarm\/', @@ -869,7 +860,6 @@ class Crawlers extends AbstractProvider 'Recorder', 'RecurPost\/', 'redback\/', - 'Redirect Checker Tool', 'ReederForMac', 'ReGet', 'RepoMonkey', @@ -1091,7 +1081,6 @@ class Crawlers extends AbstractProvider 'Upflow', 'Uptimia', 'URL Verifier', - 'URLChecker', 'URLitor\.com', 'urlresolver', 'Urlstat', @@ -1117,7 +1106,6 @@ class Crawlers extends AbstractProvider 'Vulnbusters Meter', 'VYU2', 'w3af\.org', - 'W3C_I18n-Checker', 'W3C_Unicorn', 'W3C-checklink', 'W3C-mobileOK', @@ -1172,7 +1160,6 @@ class Crawlers extends AbstractProvider 'Website\ Quester', 'WebsiteExtractor', 'websitepulse agent', - 'websitepulse[+ ]checker', 'WebsiteQuester', 'Websnapr\/', 'Webster', @@ -1256,6 +1243,6 @@ class Crawlers extends AbstractProvider 'Zombie\.js', 'Zoom\.Mac', 'ZyBorg', - '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron)', + '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker)', ); } From ffef5bad1effbea1f81cdaac8eb1104d926a59bd Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 18:30:16 +0000 Subject: [PATCH 02/12] Add generic regex 'reader'. Remove 10 duplicates. Add missing useragent --- src/Fixtures/Crawlers.php | 12 +----------- tests/crawlers.txt | 3 ++- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 75260315..748ba634 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -163,7 +163,6 @@ class Crawlers extends AbstractProvider 'BlogSearch', 'Blogtrottr', 'BlowFish', - 'Boardreader', 'boitho\.com-dc', 'BPImageWalker', 'Braintree-Webhooks', @@ -276,7 +275,6 @@ class Crawlers extends AbstractProvider 'Download\ Wonder', 'downnotifier\.com', 'DowntimeDetector', - 'Dragonfly File Reader', 'Drip', 'drupact', 'Drupal \(\+http:\/\/drupal\.org\/\)', @@ -336,7 +334,6 @@ class Crawlers extends AbstractProvider 'FeedBunch\/[0-9]', 'FeedBurner', 'Feedly', - 'Feedreader', 'FeedshowOnline', 'Feedspot', 'Feedwind\/[0-9]', @@ -447,7 +444,6 @@ class Crawlers extends AbstractProvider 'Hadi Agent', 'Hatena', 'Havij', - 'hawkReader', 'HeadlessChrome', 'HEADMasterSEO', 'HeartRails_Capture', @@ -643,7 +639,6 @@ class Crawlers extends AbstractProvider 'MegaIndex\.ru', 'MeltwaterNews', 'Melvil Rawi\/', - 'MergeFlow-PageReader', 'Metaspinner', 'MetaURI', 'MFC_Tear_Sample', @@ -743,7 +738,6 @@ class Crawlers extends AbstractProvider 'Offline\ Navigator', 'og-scraper\/', 'okhttp', - 'Omea Reader', 'omgili', 'OMSC', 'Online Domain Tools', @@ -872,9 +866,7 @@ class Crawlers extends AbstractProvider 'Robozilla\/[0-9]', 'ROI Hunter', 'RPT-HTTPClient', - 'rss reader', 'RSSOwl', - 'RssReader\/', 'safe-agent-scanner', 'SalesIntelligent', 'Saleslift', @@ -932,7 +924,6 @@ class Crawlers extends AbstractProvider 'ShortLinkTranslate', 'shrinktheweb', 'Sideqik', - 'SilverReader', 'SimplePie', 'SimplyFast', 'Siphon', @@ -1037,7 +1028,6 @@ class Crawlers extends AbstractProvider 'The\ Intraformant', 'theinternetrules', 'TheNomad', - 'theoldreader\.com', 'Thinklab', 'Thumbshots', 'ThumbSniper', @@ -1243,6 +1233,6 @@ class Crawlers extends AbstractProvider 'Zombie\.js', 'Zoom\.Mac', 'ZyBorg', - '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker)', + '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader)', ); } diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 799c311d..c971ec25 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3406,4 +3406,5 @@ Canary%20Mail/397 CFNetwork/893.13.1 Darwin/17.4.0 (x86_64) Sendsay.Ru/1.0; https://Sendsay.Ru/; ask@sendsay.ru Mozilla/5.0 (Zoom.Mac 10.8.5 x86) Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 TryJsoup/1.0 (+http://try.jsoup.org/) -Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) \ No newline at end of file +Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) +SilverReader/1.0; http://silverreader.com \ No newline at end of file From f42578bf504ac9c18b7fad9319f9015cc1425523 Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 20:36:56 +0000 Subject: [PATCH 03/12] Remove [0-9] ranges where not needed. 34 ranges removed. 1 regex removed as now a duplicate. --- src/Fixtures/Crawlers.php | 71 +++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 748ba634..3bc99542 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -102,8 +102,8 @@ class Crawlers extends AbstractProvider 'ApacheBench\/', 'Apexoo', 'APIs-Google', - 'AportWorm\/[0-9]', - 'AppBeat\/[0-9]', + 'AportWorm\/', + 'AppBeat\/', 'AppEngine-Google', 'AppStoreScraperZ', 'Aprc\/[0-9]', @@ -136,8 +136,7 @@ class Crawlers extends AbstractProvider 'basicstate', 'BatchFTP', 'Battleztar\ Bazinga', - 'baypup\/[0-9]', - 'baypup\/colbert', + 'baypup\/', 'BazQux', 'BBBike', 'BCKLINKS', @@ -169,7 +168,7 @@ class Crawlers extends AbstractProvider 'Branch Metrics API', 'Branch-Passthrough', 'Brandprotect', - 'BrandVerity\/[0-9]', + 'BrandVerity\/', 'Brandwatch', 'Brodie\/', 'Browsershots', @@ -201,7 +200,7 @@ class Crawlers extends AbstractProvider 'checkprivacy', 'CherryPicker', 'ChinaClaw', - 'Chirp\/[0-9]', + 'Chirp\/', 'chkme\.com', 'Chlooe', 'Chromaxa', @@ -214,7 +213,7 @@ class Crawlers extends AbstractProvider 'Cloud\ mapping', 'CloudEndure', 'CloudFlare-AlwaysOnline', - 'Cloudinary\/[0-9]', + 'Cloudinary\/', 'cmcm\.com', 'coccoc', 'cognitiveseo', @@ -237,7 +236,7 @@ class Crawlers extends AbstractProvider 'curb', 'Curious George', 'curl', - 'cuwhois\/[0-9]', + 'cuwhois\/', 'cybo\.com', 'DAP\/NetHTTP', 'DareBoost', @@ -331,12 +330,12 @@ class Crawlers extends AbstractProvider 'Feedbin', 'FeedBooster', 'FeedBucket', - 'FeedBunch\/[0-9]', + 'FeedBunch\/', 'FeedBurner', 'Feedly', 'FeedshowOnline', 'Feedspot', - 'Feedwind\/[0-9]', + 'Feedwind\/', 'FeedZcollector', 'feeltiptop', 'Fetch API', @@ -355,14 +354,14 @@ class Crawlers extends AbstractProvider 'Flunky', 'flynxapp', 'forensiq', - 'FoundSeoTool\/[0-9]', + 'FoundSeoTool\/', 'http:\/\/www.neomo.de\/', //'Francis [Bot]' 'free thumbnails', 'Freeuploader', 'Funnelback', 'G-i-g-a-b-o-t', 'g00g1e\.net', - 'ganarvisitas\/[0-9]', + 'ganarvisitas\/', 'geek-tools', 'Genderanalyzer', 'Genieo', @@ -373,7 +372,7 @@ class Crawlers extends AbstractProvider 'getprismatic\.com', 'GetRight', 'getroot', - 'GetURLInfo\/[0-9]', + 'GetURLInfo\/', 'GetWeb', 'Ghost Inspector', 'GigablastOpenSource', @@ -389,7 +388,7 @@ class Crawlers extends AbstractProvider 'gofetch', 'GomezAgent', 'gooblog', - 'Goodzer\/[0-9]', + 'Goodzer\/', 'Google AppsViewer', 'Google Desktop', 'Google favicon', @@ -451,13 +450,13 @@ class Crawlers extends AbstractProvider 'heritrix', 'historious\/', 'hkedcity', - 'hledejLevne\.cz\/[0-9]', + 'hledejLevne\.cz', 'Hloader', 'HMView', 'Holmes', 'HonesoSearchEngine\/', 'HootSuite Image proxy', - 'Hootsuite-WebFeed\/[0-9]', + 'Hootsuite-WebFeed\/', 'hosterstats', 'HostTracker', 'ht:\/\/check', @@ -495,7 +494,7 @@ class Crawlers extends AbstractProvider 'Id-search', 'IdeelaborPlagiaat', 'IDG Twitter Links Resolver', - 'IDwhois\/[0-9]', + 'IDwhois\/', 'Iframely', 'igdeSpyder', 'IlTrovatore', @@ -539,7 +538,7 @@ class Crawlers extends AbstractProvider 'isitup\.org', 'iskanie', 'isUp\.li', - 'iThemes Sync\/[0-9]', + 'iThemes Sync\/', 'iZSearch', 'JAHHO', 'janforman', @@ -632,7 +631,7 @@ class Crawlers extends AbstractProvider 'MarkMonitor', 'MarkWatch', 'Mass\ Downloader', - 'masscan\/[0-9]', + 'masscan\/', 'Mata\ Hari', 'Mediapartners-Google', 'mediawords', @@ -664,7 +663,7 @@ class Crawlers extends AbstractProvider 'Mojolicious \(Perl\)', 'Monit\/', 'monitis', - 'Monitority\/[0-9]', + 'Monitority\/', 'montastic', 'MonTools', 'Moreover', @@ -731,7 +730,7 @@ class Crawlers extends AbstractProvider 'nWormFeedFinder', 'Nymesis', 'NYU', - 'Ocelli\/[0-9]', + 'Ocelli\/', 'Octopus', 'oegp', 'Offline Explorer', @@ -748,13 +747,13 @@ class Crawlers extends AbstractProvider 'OpenVAS', 'Optimizer', 'Orbiter', - 'OrgProbe\/[0-9]', + 'OrgProbe\/', 'orion-semantics', 'Outlook-Express', 'ow\.ly', 'Owler', 'ownCloud News', - 'OxfordCloudService\/[0-9]', + 'OxfordCloudService\/', 'Page Analyzer', 'Page Valet', 'page_verifier', @@ -764,7 +763,7 @@ class Crawlers extends AbstractProvider 'PageGrabber', 'PagePeeker', 'PageScorer', - 'Pagespeed\/[0-9]', + 'Pagespeed\/', 'Panopta', 'panscient', 'Papa\ Foto', @@ -815,7 +814,7 @@ class Crawlers extends AbstractProvider 'PowerPoint\/', 'Priceonomics Analysis Engine', 'PrintFriendly\.com', - 'PritTorrent\/[0-9]', + 'PritTorrent\/', 'Prlog', 'probethenet', 'Project 25499', @@ -858,12 +857,12 @@ class Crawlers extends AbstractProvider 'ReGet', 'RepoMonkey', 'request\.js', - 'ResponseCodeTest\/[0-9]', + 'ResponseCodeTest\/', 'RestSharp', 'Riddler', 'Rival IQ', 'Robosourcer', - 'Robozilla\/[0-9]', + 'Robozilla\/', 'ROI Hunter', 'RPT-HTTPClient', 'RSSOwl', @@ -884,7 +883,7 @@ class Crawlers extends AbstractProvider 'SimpleScraper', 'Scrapy', 'Screaming', - 'ScreenShotService\/[0-9]', + 'ScreenShotService\/', 'Scrubby', 'Scrutiny\/', 'search\.thunderstone', @@ -919,7 +918,7 @@ class Crawlers extends AbstractProvider 'Shelob', 'Shodan', 'Shoppimon Analyzer', - 'ShoppimonAgent\/[0-9]', + 'ShoppimonAgent\/', 'ShopWiki', 'ShortLinkTranslate', 'shrinktheweb', @@ -1021,7 +1020,7 @@ class Crawlers extends AbstractProvider 'teoma', 'terrainformatica\.com', 'Test Certificate Info', - 'Tetrahedron\/[0-9]', + 'Tetrahedron\/', 'The Drop Reaper', 'The Expert HTML Source Viewer', 'The Knowledge AI', @@ -1091,8 +1090,8 @@ class Crawlers extends AbstractProvider 'Voil', 'voltron', 'voyager\/', - 'VSAgent\/[0-9]', - 'VSB-TUO\/[0-9]', + 'VSAgent\/', + 'VSB-TUO\/', 'Vulnbusters Meter', 'VYU2', 'w3af\.org', @@ -1127,7 +1126,7 @@ class Crawlers extends AbstractProvider 'WebCookies', 'WebCopier', 'WebCorp', - 'WebDataStats\/[0-9]', + 'WebDataStats\/', 'WebDoc', 'WebEnhancer', 'WebFetch', @@ -1155,7 +1154,7 @@ class Crawlers extends AbstractProvider 'Webster', 'WebStripper', 'WebSucker', - 'Webthumb\/[0-9]', + 'Webthumb\/', 'WebThumbnail', 'WebWhacker', 'WebZIP', @@ -1197,7 +1196,7 @@ class Crawlers extends AbstractProvider 'x22Mozilla', 'XaxisSemanticsClassifier', 'Xenu Link Sleuth', - 'XING-contenttabreceiver\/[0-9]', + 'XING-contenttabreceiver\/', 'xpymep([0-9]?)\.exe', 'Y!J-(ASR|BSC)', 'Y\!J-BRW', @@ -1215,7 +1214,7 @@ class Crawlers extends AbstractProvider 'Yoleo Consumer', 'yoogliFetchAgent', 'YottaaMonitor', - 'Your-Website-Sucks\/[0-9]', + 'Your-Website-Sucks', 'yourls\.org', 'YoYs\.net', 'YP\.PL', From b4a086c2837ac4b157b037abd5dab9cfe7f1eeb1 Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 20:42:06 +0000 Subject: [PATCH 04/12] Add generic regex 'extractor'. Remove 6 duplicates. Add 2 missing useragents --- src/Fixtures/Crawlers.php | 8 +------- tests/crawlers.txt | 3 ++- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 3bc99542..4a81958f 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -70,7 +70,6 @@ class Crawlers extends AbstractProvider 'ADmantX', 'adressendeutschland', 'adscanner\/', - 'Advanced Email Extractor v', 'agentslug', 'AHC', 'aihit', @@ -294,7 +293,6 @@ class Crawlers extends AbstractProvider 'elefent', 'EMail Exractor', 'EMail\ Wolf', - 'Email%20Extractor', 'EmailWolf', 'Embarcadero', 'Embed PHP Library', @@ -312,7 +310,6 @@ class Crawlers extends AbstractProvider 'exif', 'Exploratodo', 'Express WebPictures', - 'ExtractorPro', 'Extreme\ Picture\ Finder', 'EyeNetIE', 'ezooms', @@ -563,7 +560,6 @@ class Crawlers extends AbstractProvider 'Kelny\/', 'Kerrigan\/', 'KeyCDN', - 'Keyword Extractor', 'Keyword\ Density', 'Keywords Research', 'KickFire', @@ -578,7 +574,6 @@ class Crawlers extends AbstractProvider 'L\.webis', 'Larbin', 'Lavf\/', - 'LayeredExtractor', 'LeechFTP', 'LeechGet', 'letsencrypt', @@ -1147,7 +1142,6 @@ class Crawlers extends AbstractProvider 'Webshot', 'Website Analyzer\/', 'Website\ Quester', - 'WebsiteExtractor', 'websitepulse agent', 'WebsiteQuester', 'Websnapr\/', @@ -1232,6 +1226,6 @@ class Crawlers extends AbstractProvider 'Zombie\.js', 'Zoom\.Mac', 'ZyBorg', - '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader)', + '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor)', ); } diff --git a/tests/crawlers.txt b/tests/crawlers.txt index c971ec25..6f75bbca 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3407,4 +3407,5 @@ Sendsay.Ru/1.0; https://Sendsay.Ru/; ask@sendsay.ru Mozilla/5.0 (Zoom.Mac 10.8.5 x86) Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 TryJsoup/1.0 (+http://try.jsoup.org/) Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) -SilverReader/1.0; http://silverreader.com \ No newline at end of file +SilverReader/1.0; http://silverreader.com +ExtractorPro \ No newline at end of file From fb9082fa4501c3c5eeaed390842cadee72629b6e Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 20:45:49 +0000 Subject: [PATCH 05/12] Add other missing extractor useragent test --- tests/crawlers.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/crawlers.txt b/tests/crawlers.txt index 6f75bbca..d33916e0 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3408,4 +3408,5 @@ Mozilla/5.0 (Zoom.Mac 10.8.5 x86) Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 TryJsoup/1.0 (+http://try.jsoup.org/) Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) SilverReader/1.0; http://silverreader.com -ExtractorPro \ No newline at end of file +ExtractorPro +WebsiteExtractor \ No newline at end of file From 40538217433975948700a5e13fcf0b7ee683c9d9 Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 22:47:46 +0000 Subject: [PATCH 06/12] Remove \.com from some regexs --- src/Fixtures/Crawlers.php | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 4a81958f..a9d312e0 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -82,7 +82,7 @@ class Crawlers extends AbstractProvider 'alexa site audit', 'Alibaba\.Security\.Heimdall', 'Alligator', - 'allloadin\.com', + 'allloadin', 'AllSubmitter', 'alyze\.info', 'amagit', @@ -111,7 +111,7 @@ class Crawlers extends AbstractProvider 'Arachnophilia', 'aria2', 'Arukereso', - 'asafaweb\.com', + 'asafaweb', 'AskQuickly', 'Ask Jeeves', 'ASPSeek', @@ -222,7 +222,7 @@ class Crawlers extends AbstractProvider 'contactbigdatafr', 'contentkingapp', 'convera', - 'CookieReports\.com', + 'CookieReports', 'copyright sheriff', 'CopyRightCheck', 'Copyscape', @@ -271,7 +271,7 @@ class Crawlers extends AbstractProvider 'dotSemantic', 'downforeveryoneorjustme', 'Download\ Wonder', - 'downnotifier\.com', + 'downnotifier', 'DowntimeDetector', 'Drip', 'drupact', @@ -366,7 +366,7 @@ class Crawlers extends AbstractProvider 'GetCode', 'Getintent', 'GetLinkInfo', - 'getprismatic\.com', + 'getprismatic', 'GetRight', 'getroot', 'GetURLInfo\/', @@ -459,7 +459,7 @@ class Crawlers extends AbstractProvider 'ht:\/\/check', 'htdig', 'HTMLparser', - 'htmlyse\.com', + 'htmlyse', 'HTTP Banner Detection', 'HTTP_Compression_Test', 'http_request2', @@ -707,7 +707,7 @@ class Crawlers extends AbstractProvider 'Nibbler', 'NICErsPRO', 'Nikto', - 'nineconnections\.com', + 'nineconnections', 'NLNZ_IAHarvester', 'Nmap Scripting Engine', 'node-superagent', @@ -808,7 +808,7 @@ class Crawlers extends AbstractProvider 'postrank', 'PowerPoint\/', 'Priceonomics Analysis Engine', - 'PrintFriendly\.com', + 'PrintFriendly', 'PritTorrent\/', 'Prlog', 'probethenet', @@ -831,7 +831,7 @@ class Crawlers extends AbstractProvider 'QQDownload', 'QrafterPro', 'Qseero', - 'Qualidator\.com SiteAnalyzer', + 'Qualidator', 'QueryN\ Metasearch', 'queuedriver', 'Quora Link Preview', @@ -907,7 +907,7 @@ class Crawlers extends AbstractProvider 'seoscanners', 'SEOstats', 'Server Density Service Monitoring', - 'servernfo\.com', + 'servernfo', 'sexsearcher', 'Seznam', 'Shelob', @@ -1013,7 +1013,7 @@ class Crawlers extends AbstractProvider 'Telesphorep', 'Tenon\.io', 'teoma', - 'terrainformatica\.com', + 'terrainformatica', 'Test Certificate Info', 'Tetrahedron\/', 'The Drop Reaper', @@ -1037,13 +1037,13 @@ class Crawlers extends AbstractProvider 'TrapitAgent', 'Trendiction', 'Trendsmap', - 'trendspottr\.com', + 'trendspottr', 'truwoGPS', 'TryJsoup', 'TulipChain', 'Turingos', 'Turnitin', - 'tweetedtimes\.com', + 'tweetedtimes', 'Tweetminster', 'Tweezler\/', 'twibble', @@ -1065,7 +1065,7 @@ class Crawlers extends AbstractProvider 'Upflow', 'Uptimia', 'URL Verifier', - 'URLitor\.com', + 'URLitor', 'urlresolver', 'Urlstat', 'UrlTrends Ranking Updater', From 128c59996e1692e4a228fe3b8922aa170351d4bb Mon Sep 17 00:00:00 2001 From: Max King Date: Thu, 3 Jan 2019 22:57:48 +0000 Subject: [PATCH 07/12] Add generic regex 'monitoring'. Remove 6 duplicates. Add 1 missing Useragent --- src/Fixtures/Crawlers.php | 8 +------- tests/UATests.php | 2 +- tests/crawlers.txt | 3 ++- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index a9d312e0..475aa04c 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -395,7 +395,6 @@ class Crawlers extends AbstractProvider 'Google PP Default', 'Google Search Console', 'Google Web Preview', - 'google_partner_monitoring', 'Google-Adwords', 'Google-Apps-Script', 'Google-Calendar-Importer', @@ -407,7 +406,6 @@ class Crawlers extends AbstractProvider 'Google-Structured-Data-Testing-Tool', 'Google-Youtube-Links', 'google-xrawler', - 'GoogleCloudMonitoring', 'GoogleDocs', 'GoogleHC\/', 'GoogleProducer', @@ -906,7 +904,6 @@ class Crawlers extends AbstractProvider 'SeopultContentAnalyzer', 'seoscanners', 'SEOstats', - 'Server Density Service Monitoring', 'servernfo', 'sexsearcher', 'Seznam', @@ -991,7 +988,6 @@ class Crawlers extends AbstractProvider 'suchen', 'Sucuri', 'summify', - 'Super Monitoring', 'SuperHTTP', 'Surphace Scout', 'Suzuran', @@ -1102,7 +1098,6 @@ class Crawlers extends AbstractProvider 'WbSrch\/', 'WDT\.io', 'web-capture\.net', - 'Web-Monitoring', 'Web-sniffer', 'Web\ Auto', 'Web\ Collage', @@ -1196,7 +1191,6 @@ class Crawlers extends AbstractProvider 'Y\!J-BRW', 'Yaanb', 'yacy', - 'Yahoo Ad monitoring', 'Yahoo Link Preview', 'YahooCacheSystem', 'YahooYSMcm', @@ -1226,6 +1220,6 @@ class Crawlers extends AbstractProvider 'Zombie\.js', 'Zoom\.Mac', 'ZyBorg', - '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor)', + '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring)', ); } diff --git a/tests/UATests.php b/tests/UATests.php index 4a49687d..d2054548 100644 --- a/tests/UATests.php +++ b/tests/UATests.php @@ -51,7 +51,7 @@ public function it_returns_correct_matched_bot_name() $matches = $this->CrawlerDetect->getMatches(); - $this->assertEquals($this->CrawlerDetect->getMatches(), 'Yahoo Ad monitoring', $matches); + $this->assertEquals($this->CrawlerDetect->getMatches(), 'monitoring', $matches); } /** @test */ diff --git a/tests/crawlers.txt b/tests/crawlers.txt index d33916e0..113b75f8 100644 --- a/tests/crawlers.txt +++ b/tests/crawlers.txt @@ -3409,4 +3409,5 @@ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus) SilverReader/1.0; http://silverreader.com ExtractorPro -WebsiteExtractor \ No newline at end of file +WebsiteExtractor +Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1 google_partner_monitoring FWSzVTDDBz14547302713138T \ No newline at end of file From e7a8000fd6dce3e7b515c5a746196bb930357f2e Mon Sep 17 00:00:00 2001 From: Max King Date: Fri, 4 Jan 2019 18:00:56 +0000 Subject: [PATCH 08/12] Add generic regex 'analyzer'. Remove 9 duplicates. --- src/Fixtures/Crawlers.php | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 475aa04c..2f1b6b9e 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -75,7 +75,6 @@ class Crawlers extends AbstractProvider 'aihit', 'aiohttp\/', 'Airmail', - 'Akamai_Site_Analyzer', 'akka-http\/', 'akula\/', 'alertra', @@ -247,7 +246,6 @@ class Crawlers extends AbstractProvider 'dataprovider', 'DataXu', 'Daum(oa)?[ \/][0-9]', - 'DemandbasePublisherAnalyzer\/', 'Demon', 'DeuSu', 'developers\.google\.com\/\+\/web\/snippet\/', @@ -261,7 +259,6 @@ class Crawlers extends AbstractProvider 'DittoSpyder', 'dlvr', 'DMBrowser', - 'DNS-Tools Header-Analyzer', 'DNSPod-reporting', 'docoloc', 'Dolphin http client\/', @@ -360,7 +357,6 @@ class Crawlers extends AbstractProvider 'g00g1e\.net', 'ganarvisitas\/', 'geek-tools', - 'Genderanalyzer', 'Genieo', 'GentleSource', 'GetCode', @@ -747,12 +743,10 @@ class Crawlers extends AbstractProvider 'Owler', 'ownCloud News', 'OxfordCloudService\/', - 'Page Analyzer', 'Page Valet', 'page_verifier', 'page\ scorer', 'page2rss', - 'PageAnalyzer', 'PageGrabber', 'PagePeeker', 'PageScorer', @@ -901,7 +895,6 @@ class Crawlers extends AbstractProvider 'Seomoz', 'SEOprofiler', 'SEOsearch\/', - 'SeopultContentAnalyzer', 'seoscanners', 'SEOstats', 'servernfo', @@ -909,8 +902,7 @@ class Crawlers extends AbstractProvider 'Seznam', 'Shelob', 'Shodan', - 'Shoppimon Analyzer', - 'ShoppimonAgent\/', + 'Shoppimon', 'ShopWiki', 'ShortLinkTranslate', 'shrinktheweb', @@ -1135,7 +1127,6 @@ class Crawlers extends AbstractProvider 'webscreenie', 'Webshag', 'Webshot', - 'Website Analyzer\/', 'Website\ Quester', 'websitepulse agent', 'WebsiteQuester', @@ -1220,6 +1211,6 @@ class Crawlers extends AbstractProvider 'Zombie\.js', 'Zoom\.Mac', 'ZyBorg', - '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring)', + '[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)', ); } From 5c2021f963d44cbc614453249cb38a3c93a725a9 Mon Sep 17 00:00:00 2001 From: Max King Date: Fri, 4 Jan 2019 18:12:55 +0000 Subject: [PATCH 09/12] Remove \/ from regexs that are not too generic or short. --- src/Fixtures/Crawlers.php | 74 +++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 2f1b6b9e..5cf5095e 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -69,7 +69,7 @@ class Crawlers extends AbstractProvider 'AddThis', 'ADmantX', 'adressendeutschland', - 'adscanner\/', + 'adscanner', 'agentslug', 'AHC', 'aihit', @@ -88,16 +88,16 @@ class Crawlers extends AbstractProvider 'Anarchie', 'AndroidDownloadManager', 'Anemone', - 'AngleSharp\/', + 'AngleSharp', 'annotate_google', 'Ant\.com', 'Anturis Agent', 'AnyEvent-HTTP\/', 'Apache Droid', 'Apache OpenOffice', - 'Apache-HttpAsyncClient\/', - 'Apache-HttpClient\/', - 'ApacheBench\/', + 'Apache-HttpAsyncClient', + 'Apache-HttpClient', + 'ApacheBench', 'Apexoo', 'APIs-Google', 'AportWorm\/', @@ -139,8 +139,8 @@ class Crawlers extends AbstractProvider 'BBBike', 'BCKLINKS', 'BDFetch', - 'BegunAdvertising\/', - 'Bidtellect\/', + 'BegunAdvertising', + 'Bidtellect', 'BigBozz', 'Bigfoot', 'biglotron', @@ -154,7 +154,7 @@ class Crawlers extends AbstractProvider 'Blackboard Safeassign', 'BlackWidow', 'BlockNote\.Net', - 'Bloglines\/', + 'Bloglines', 'Bloglovin', 'BlogPulseLive', 'BlogSearch', @@ -166,7 +166,7 @@ class Crawlers extends AbstractProvider 'Branch Metrics API', 'Branch-Passthrough', 'Brandprotect', - 'BrandVerity\/', + 'BrandVerity', 'Brandwatch', 'Brodie\/', 'Browsershots', @@ -192,7 +192,7 @@ class Crawlers extends AbstractProvider 'CERT\.at-Statistics-Survey', 'cg-eye', 'changedetection', - 'ChangesMeter\/', + 'ChangesMeter', 'Charlotte', 'CheckHost', 'checkprivacy', @@ -202,7 +202,7 @@ class Crawlers extends AbstractProvider 'chkme\.com', 'Chlooe', 'Chromaxa', - 'CirrusExplorer\/', + 'CirrusExplorer', 'CISPA Vulnerability Notification', 'Citoid', 'CJNetworkQuality', @@ -211,7 +211,7 @@ class Crawlers extends AbstractProvider 'Cloud\ mapping', 'CloudEndure', 'CloudFlare-AlwaysOnline', - 'Cloudinary\/', + 'Cloudinary', 'cmcm\.com', 'coccoc', 'cognitiveseo', @@ -261,7 +261,7 @@ class Crawlers extends AbstractProvider 'DMBrowser', 'DNSPod-reporting', 'docoloc', - 'Dolphin http client\/', + 'Dolphin http client', 'DomainAppender', 'Donuts Content Explorer', 'dotMailer content retrieval', @@ -348,14 +348,14 @@ class Crawlers extends AbstractProvider 'Flunky', 'flynxapp', 'forensiq', - 'FoundSeoTool\/', + 'FoundSeoTool', 'http:\/\/www.neomo.de\/', //'Francis [Bot]' 'free thumbnails', 'Freeuploader', 'Funnelback', 'G-i-g-a-b-o-t', 'g00g1e\.net', - 'ganarvisitas\/', + 'ganarvisitas', 'geek-tools', 'Genieo', 'GentleSource', @@ -371,7 +371,7 @@ class Crawlers extends AbstractProvider 'GigablastOpenSource', 'GIS-LABS', 'github-camo', - 'github\.com\/', + 'github\.com', 'Go [\d\.]* package http', 'Go http package', 'Go-Ahead-Got-It', @@ -439,15 +439,15 @@ class Crawlers extends AbstractProvider 'HeartRails_Capture', 'help@dataminr\.com', 'heritrix', - 'historious\/', + 'historious', 'hkedcity', 'hledejLevne\.cz', 'Hloader', 'HMView', 'Holmes', - 'HonesoSearchEngine\/', + 'HonesoSearchEngine', 'HootSuite Image proxy', - 'Hootsuite-WebFeed\/', + 'Hootsuite-WebFeed', 'hosterstats', 'HostTracker', 'ht:\/\/check', @@ -626,7 +626,7 @@ class Crawlers extends AbstractProvider 'mediawords', 'MegaIndex\.ru', 'MeltwaterNews', - 'Melvil Rawi\/', + 'Melvil Rawi', 'Metaspinner', 'MetaURI', 'MFC_Tear_Sample', @@ -639,7 +639,7 @@ class Crawlers extends AbstractProvider 'MIDown\ tool', 'MIIxpc', 'Mindjet', - 'Miniature\.io\/', + 'Miniature\.io', 'Miniflux', 'Mister\ PiX', 'mixdata dot com', @@ -669,7 +669,7 @@ class Crawlers extends AbstractProvider 'MVAClient', 'MxToolbox\/', 'nagios', - 'Najdi\.si\/', + 'Najdi\.si', 'Name\ Intelligence', 'Nameprotect', 'Navroad', @@ -705,7 +705,7 @@ class Crawlers extends AbstractProvider 'NLNZ_IAHarvester', 'Nmap Scripting Engine', 'node-superagent', - 'node-urllib\/', + 'node-urllib', 'node\.io', 'Nodemeter', 'NodePing', @@ -724,7 +724,7 @@ class Crawlers extends AbstractProvider 'oegp', 'Offline Explorer', 'Offline\ Navigator', - 'og-scraper\/', + 'og-scraper', 'okhttp', 'omgili', 'OMSC', @@ -742,7 +742,7 @@ class Crawlers extends AbstractProvider 'ow\.ly', 'Owler', 'ownCloud News', - 'OxfordCloudService\/', + 'OxfordCloudService', 'Page Valet', 'page_verifier', 'page\ scorer', @@ -774,7 +774,7 @@ class Crawlers extends AbstractProvider 'Picsearch', 'PictureFinder', 'Pimonster', - 'ping\.blo\.gs\/', + 'ping\.blo\.gs', 'Pingability', 'PingAdmin\.Ru', 'Pingdom', @@ -795,13 +795,13 @@ class Crawlers extends AbstractProvider 'Porkbun', 'Port Monitor', 'postano', - 'PostmanRuntime\/', + 'PostmanRuntime', 'PostPost', 'postrank', 'PowerPoint\/', 'Priceonomics Analysis Engine', 'PrintFriendly', - 'PritTorrent\/', + 'PritTorrent', 'Prlog', 'probethenet', 'Project 25499', @@ -844,12 +844,12 @@ class Crawlers extends AbstractProvider 'ReGet', 'RepoMonkey', 'request\.js', - 'ResponseCodeTest\/', + 'ResponseCodeTest', 'RestSharp', 'Riddler', 'Rival IQ', 'Robosourcer', - 'Robozilla\/', + 'Robozilla', 'ROI Hunter', 'RPT-HTTPClient', 'RSSOwl', @@ -870,11 +870,11 @@ class Crawlers extends AbstractProvider 'SimpleScraper', 'Scrapy', 'Screaming', - 'ScreenShotService\/', + 'ScreenShotService', 'Scrubby', 'Scrutiny\/', 'search\.thunderstone', - 'Search37\/', + 'Search37', 'Searchestate', 'SearchExpress', 'SearchSight', @@ -894,7 +894,7 @@ class Crawlers extends AbstractProvider 'SEOkicks', 'Seomoz', 'SEOprofiler', - 'SEOsearch\/', + 'SEOsearch', 'seoscanners', 'SEOstats', 'servernfo', @@ -1003,7 +1003,7 @@ class Crawlers extends AbstractProvider 'teoma', 'terrainformatica', 'Test Certificate Info', - 'Tetrahedron\/', + 'Tetrahedron', 'The Drop Reaper', 'The Expert HTML Source Viewer', 'The Knowledge AI', @@ -1108,7 +1108,7 @@ class Crawlers extends AbstractProvider 'WebCookies', 'WebCopier', 'WebCorp', - 'WebDataStats\/', + 'WebDataStats', 'WebDoc', 'WebEnhancer', 'WebFetch', @@ -1130,7 +1130,7 @@ class Crawlers extends AbstractProvider 'Website\ Quester', 'websitepulse agent', 'WebsiteQuester', - 'Websnapr\/', + 'Websnapr', 'Webster', 'WebStripper', 'WebSucker', @@ -1176,7 +1176,7 @@ class Crawlers extends AbstractProvider 'x22Mozilla', 'XaxisSemanticsClassifier', 'Xenu Link Sleuth', - 'XING-contenttabreceiver\/', + 'XING-contenttabreceiver', 'xpymep([0-9]?)\.exe', 'Y!J-(ASR|BSC)', 'Y\!J-BRW', From 0935d1eb9932f1109c740ae0aa1d9c0c6dc0c8a5 Mon Sep 17 00:00:00 2001 From: Max King Date: Sun, 6 Jan 2019 18:11:13 +0000 Subject: [PATCH 10/12] Reduce length of 2 very long regexs --- src/Fixtures/Crawlers.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index 5cf5095e..d41329b2 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -504,7 +504,7 @@ class Crawlers extends AbstractProvider 'infegy', 'infohelfer', 'InfoTekies', - 'InfoWizards Reciprocal Link System PRO', + 'InfoWizards Reciprocal Link', 'inpwrd\.com', 'instabid', 'Instapaper', @@ -805,7 +805,6 @@ class Crawlers extends AbstractProvider 'Prlog', 'probethenet', 'Project 25499', - 'Promotion_Tools_www\.searchenginepromotionhelp\.com', 'prospectb2b', 'Protopage', 'ProWebWalker', @@ -875,6 +874,7 @@ class Crawlers extends AbstractProvider 'Scrutiny\/', 'search\.thunderstone', 'Search37', + 'searchenginepromotionhelp', 'Searchestate', 'SearchExpress', 'SearchSight', From 18ea21d0081999d2f464835f63beb52ba04483fc Mon Sep 17 00:00:00 2001 From: Max King Date: Mon, 7 Jan 2019 19:43:11 +0000 Subject: [PATCH 11/12] Remove escaped whitespace --- src/Fixtures/Crawlers.php | 78 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/Fixtures/Crawlers.php b/src/Fixtures/Crawlers.php index d41329b2..2b2415b5 100644 --- a/src/Fixtures/Crawlers.php +++ b/src/Fixtures/Crawlers.php @@ -133,7 +133,7 @@ class Crawlers extends AbstractProvider 'Bandit', 'basicstate', 'BatchFTP', - 'Battleztar\ Bazinga', + 'Battleztar Bazinga', 'baypup\/', 'BazQux', 'BBBike', @@ -150,7 +150,7 @@ class Crawlers extends AbstractProvider 'biNu image cacher', 'Bitacle', 'biz_Directory', - 'Black\ Hole', + 'Black Hole', 'Blackboard Safeassign', 'BlackWidow', 'BlockNote\.Net', @@ -208,7 +208,7 @@ class Crawlers extends AbstractProvider 'CJNetworkQuality', 'Clarsentia', 'clips\.ua\.ac\.be', - 'Cloud\ mapping', + 'Cloud mapping', 'CloudEndure', 'CloudFlare-AlwaysOnline', 'Cloudinary', @@ -267,13 +267,13 @@ class Crawlers extends AbstractProvider 'dotMailer content retrieval', 'dotSemantic', 'downforeveryoneorjustme', - 'Download\ Wonder', + 'Download Wonder', 'downnotifier', 'DowntimeDetector', 'Drip', 'drupact', 'Drupal \(\+http:\/\/drupal\.org\/\)', - 'DTS\ Agent', + 'DTS Agent', 'dubaiindex', 'EARTHCOM', 'Easy-Thumb', @@ -289,7 +289,7 @@ class Crawlers extends AbstractProvider 'ElectricMonk', 'elefent', 'EMail Exractor', - 'EMail\ Wolf', + 'EMail Wolf', 'EmailWolf', 'Embarcadero', 'Embed PHP Library', @@ -307,7 +307,7 @@ class Crawlers extends AbstractProvider 'exif', 'Exploratodo', 'Express WebPictures', - 'Extreme\ Picture\ Finder', + 'Extreme Picture Finder', 'EyeNetIE', 'ezooms', 'facebookexternalhit', @@ -489,8 +489,8 @@ class Crawlers extends AbstractProvider 'Iframely', 'igdeSpyder', 'IlTrovatore', - 'Image\ Fetch', - 'Image\ Sucker', + 'Image Fetch', + 'Image Sucker', 'ImageEngine\/', 'ImageVisu\/', 'Imagga', @@ -499,7 +499,7 @@ class Crawlers extends AbstractProvider 'InAGist', 'inbound\.li parser', 'InDesign%20CC', - 'Indy\ Library', + 'Indy Library', 'InetURL', 'infegy', 'infohelfer', @@ -513,7 +513,7 @@ class Crawlers extends AbstractProvider 'Intelliseek', 'InterGET', 'internet_archive', - 'Internet\ Ninja', + 'Internet Ninja', 'InternetSeer', 'internetVista monitor', 'intraVnews', @@ -554,7 +554,7 @@ class Crawlers extends AbstractProvider 'Kelny\/', 'Kerrigan\/', 'KeyCDN', - 'Keyword\ Density', + 'Keyword Density', 'Keywords Research', 'KickFire', 'KimonoLabs\/', @@ -619,9 +619,9 @@ class Crawlers extends AbstractProvider 'marketinggrader', 'MarkMonitor', 'MarkWatch', - 'Mass\ Downloader', + 'Mass Downloader', 'masscan\/', - 'Mata\ Hari', + 'Mata Hari', 'Mediapartners-Google', 'mediawords', 'MegaIndex\.ru', @@ -635,13 +635,13 @@ class Crawlers extends AbstractProvider 'Microsoft Outlook', 'Microsoft Windows Network Diagnostics', 'Microsoft-WebDAV-MiniRedir', - 'Microsoft\ Data\ Access', - 'MIDown\ tool', + 'Microsoft Data Access', + 'MIDown tool', 'MIIxpc', 'Mindjet', 'Miniature\.io', 'Miniflux', - 'Mister\ PiX', + 'Mister PiX', 'mixdata dot com', 'mixed-content-scan', 'Mixmax-LinkPreview', @@ -656,12 +656,12 @@ class Crawlers extends AbstractProvider 'montastic', 'MonTools', 'Moreover', - 'Morfeus\ Fucking\ Scanner', + 'Morfeus Fucking Scanner', 'Morning Paper', 'MovableType', 'mowser', 'Mrcgiguy', - 'MS\ Web\ Services\ Client\ Protocol', + 'MS Web Services Client Protocol', 'MSFrontPage', 'mShots', 'MuckRack\/', @@ -670,13 +670,13 @@ class Crawlers extends AbstractProvider 'MxToolbox\/', 'nagios', 'Najdi\.si', - 'Name\ Intelligence', + 'Name Intelligence', 'Nameprotect', 'Navroad', 'NearSite', 'Needle', 'Nessus', - 'Net\ Vampire', + 'Net Vampire', 'NetAnts', 'NETCRAFT', 'NetLyzer', @@ -723,7 +723,7 @@ class Crawlers extends AbstractProvider 'Octopus', 'oegp', 'Offline Explorer', - 'Offline\ Navigator', + 'Offline Navigator', 'og-scraper', 'okhttp', 'omgili', @@ -745,7 +745,7 @@ class Crawlers extends AbstractProvider 'OxfordCloudService', 'Page Valet', 'page_verifier', - 'page\ scorer', + 'page scorer', 'page2rss', 'PageGrabber', 'PagePeeker', @@ -753,7 +753,7 @@ class Crawlers extends AbstractProvider 'Pagespeed\/', 'Panopta', 'panscient', - 'Papa\ Foto', + 'Papa Foto', 'parsijoo', 'Pavuk', 'PayPal IPN', @@ -823,7 +823,7 @@ class Crawlers extends AbstractProvider 'QrafterPro', 'Qseero', 'Qualidator', - 'QueryN\ Metasearch', + 'QueryN Metasearch', 'queuedriver', 'Quora Link Preview', 'Qwantify', @@ -912,7 +912,7 @@ class Crawlers extends AbstractProvider 'Siphon', 'SISTRIX', 'Site-Shot\/', - 'Site\ Sucker', + 'Site Sucker', 'Site24x7', 'SiteBar', 'Sitebeam', @@ -1007,7 +1007,7 @@ class Crawlers extends AbstractProvider 'The Drop Reaper', 'The Expert HTML Source Viewer', 'The Knowledge AI', - 'The\ Intraformant', + 'The Intraformant', 'theinternetrules', 'TheNomad', 'Thinklab', @@ -1057,11 +1057,11 @@ class Crawlers extends AbstractProvider 'urlresolver', 'Urlstat', 'UrlTrends Ranking Updater', - 'URLy\ Warning', + 'URLy Warning', 'URLy\.Warning', 'Vacuum', 'Vagabondo', - 'VB\ Project', + 'VB Project', 'vBSEO', 'VCI', 'via ggpht\.com GoogleImageProxy', @@ -1091,14 +1091,14 @@ class Crawlers extends AbstractProvider 'WDT\.io', 'web-capture\.net', 'Web-sniffer', - 'Web\ Auto', - 'Web\ Collage', - 'Web\ Enhancer', - 'Web\ Fetch', - 'Web\ Fuck', - 'Web\ Pix', - 'Web\ Sauger', - 'Web\ Sucker', + 'Web Auto', + 'Web Collage', + 'Web Enhancer', + 'Web Fetch', + 'Web Fuck', + 'Web Pix', + 'Web Sauger', + 'Web Sucker', 'Webalta', 'Webauskunft', 'WebAuto', @@ -1113,7 +1113,7 @@ class Crawlers extends AbstractProvider 'WebEnhancer', 'WebFetch', 'WebFuck', - 'WebGo\ IS', + 'WebGo IS', 'WebImageCollector', 'WebImages', 'WebIndex', @@ -1127,7 +1127,7 @@ class Crawlers extends AbstractProvider 'webscreenie', 'Webshag', 'Webshot', - 'Website\ Quester', + 'Website Quester', 'websitepulse agent', 'WebsiteQuester', 'Websnapr', From e172821bf7dc2f1e614159f096e698528ff737e5 Mon Sep 17 00:00:00 2001 From: Max King Date: Mon, 7 Jan 2019 19:59:22 +0000 Subject: [PATCH 12/12] Run export script --- raw/Crawlers.json | 2 +- raw/Crawlers.txt | 275 +++++++++++++++++++--------------------------- 2 files changed, 116 insertions(+), 161 deletions(-) diff --git a/raw/Crawlers.json b/raw/Crawlers.json index 4396811d..551fbbe3 100644 --- a/raw/Crawlers.json +++ b/raw/Crawlers.json @@ -1 +1 @@ -[".*Java.*outbrain"," YLT","^b0t$","^bluefish ","^Calypso v\\\/","^COMODO DCV","^DangDang","^DavClnt","^FDM ","^git\\\/","^Goose\\\/","^Grabber","^HTTPClient\\\/","^Java\\\/","^Jeode\\\/","^Jetty\\\/","^Mail\\\/","^Mget","^Microsoft URL Control","^NG\\\/[0-9\\.]","^NING\\\/","^PHP\\\/[0-9]","^RMA\\\/","^Ruby|Ruby\\\/[0-9]","^VSE\\\/[0-9]","^WordPress\\.com","^XRL\\\/[0-9]","^ZmEu","008\\\/","13TABS","192\\.comAgent","2ip\\.ru","404checker","404enemy","7Siters","80legs","a\\.pr-cy\\.ru","a3logics\\.in","A6-Indexer","Abonti","Aboundex","aboutthedomain","Accoona-AI-Agent","acoon","acrylicapps\\.com\\\/pulp","Acunetix","AdAuth\\\/","adbeat","AddThis","ADmantX","adressendeutschland","adscanner\\\/","Advanced Email Extractor v","agentslug","AHC","aihit","aiohttp\\\/","Airmail","Akamai_Site_Analyzer","akka-http\\\/","akula\\\/","alertra","alexa site audit","Alibaba\\.Security\\.Heimdall","Alligator","allloadin\\.com","AllSubmitter","alyze\\.info","amagit","Anarchie","AndroidDownloadManager","Anemone","AngleSharp\\\/","annotate_google","Ant\\.com","Anturis Agent","AnyEvent-HTTP\\\/","Apache Droid","Apache OpenOffice","Apache-HttpAsyncClient\\\/","Apache-HttpClient\\\/","ApacheBench\\\/","Apexoo","APIs-Google","AportWorm\\\/[0-9]","AppBeat\\\/[0-9]","AppEngine-Google","AppStoreScraperZ","Aprc\\\/[0-9]","Arachmo","arachnode","Arachnophilia","aria2","Arukereso","asafaweb\\.com","AskQuickly","Ask Jeeves","ASPSeek","Asterias","Astute","asynchttp","Attach","autocite","Autonomy","axios\\\/","B-l-i-t-z-B-O-T","Backlink-Ceck","backlink-check","BacklinkHttpStatus","BackStreet","BackWeb","Bad-Neighborhood","Badass","baidu\\.com","Bandit","basicstate","BatchFTP","Battleztar\\ Bazinga","baypup\\\/[0-9]","baypup\\\/colbert","BazQux","BBBike","BCKLINKS","BDFetch","BegunAdvertising\\\/","Bidtellect\\\/","BigBozz","Bigfoot","biglotron","BingLocalSearch","BingPreview","binlar","biNu image cacher","Bitacle","biz_Directory","Black\\ Hole","Blackboard Safeassign","BlackWidow","BlockNote\\.Net","Bloglines\\\/","Bloglovin","BlogPulseLive","BlogSearch","Blogtrottr","BlowFish","Boardreader","boitho\\.com-dc","BPImageWalker","Braintree-Webhooks","Branch Metrics API","Branch-Passthrough","Brandprotect","BrandVerity\\\/[0-9]","Brandwatch","Brodie\\\/","Browsershots","BUbiNG","Buck\\\/","Buddy","BuiltWith","Bullseye","BunnySlippers","Burf Search","Butterfly\\\/","BuzzSumo","CAAM\\\/[0-9]","CakePHP","Calculon","Canary%20Mail","CapsuleChecker","CaretNail","catexplorador","CC Metadata Scaper","Cegbfeieh","censys","Cerberian Drtrs","CERT\\.at-Statistics-Survey","cg-eye","changedetection","ChangesMeter\\\/","Charlotte","CheckHost","checkprivacy","CherryPicker","ChinaClaw","Chirp\\\/[0-9]","chkme\\.com","Chlooe","Chromaxa","CirrusExplorer\\\/","CISPA Vulnerability Notification","Citoid","CJNetworkQuality","Clarsentia","clips\\.ua\\.ac\\.be","Cloud\\ mapping","CloudEndure","CloudFlare-AlwaysOnline","Cloudinary\\\/[0-9]","cmcm\\.com","coccoc","cognitiveseo","colly -","CommaFeed","Commons-HttpClient","Comodo SSL Checker","contactbigdatafr","contentkingapp","convera","CookieReports\\.com","copyright sheriff","CopyRightCheck","Copyscape","Cosmos4j\\.feedback","Covario-IDS","Crescent","Crowsnest","Criteo","CSHttp","curb","Curious George","curl","cuwhois\\\/[0-9]","cybo\\.com","DAP\\\/NetHTTP","DareBoost","DatabaseDriverMysqli","DataCha0s","Datafeedwatch","Datanyze","DataparkSearch","dataprovider","DataXu","Daum(oa)?[ \\\/][0-9]","DemandbasePublisherAnalyzer\\\/","Demon","DeuSu","developers\\.google\\.com\\\/\\+\\\/web\\\/snippet\\\/","Devil","Digg","Digincore","DigitalPebble","Dirbuster","Disqus\\\/","Dispatch\\\/","DittoSpyder","dlvr","DMBrowser","DNS-Tools Header-Analyzer","DNSPod-reporting","docoloc","Dolphin http client\\\/","DomainAppender","Donuts Content Explorer","dotMailer content retrieval","dotSemantic","downforeveryoneorjustme","Download\\ Wonder","downnotifier\\.com","DowntimeDetector","Dragonfly File Reader","Drip","drupact","Drupal \\(\\+http:\\\/\\\/drupal\\.org\\\/\\)","DTS\\ Agent","dubaiindex","EARTHCOM","Easy-Thumb","EasyDL","Ebingbong","ec2linkfinder","eCairn-Grabber","eCatch","ECCP","eContext\\\/","Ecxi","EirGrabber","ElectricMonk","elefent","EMail Exractor","EMail\\ Wolf","Email%20Extractor","EmailWolf","Embarcadero","Embed PHP Library","Embedly","endo\\\/","europarchive\\.org","evc-batch","EventMachine HttpClient","Everwall Link Expander","Evidon","Evrinid","ExactSearch","ExaleadCloudview","Excel\\\/","exif","Exploratodo","Express WebPictures","ExtractorPro","Extreme\\ Picture\\ Finder","EyeNetIE","ezooms","facebookexternalhit","facebookplatform","fairshare","Faraday v","fasthttp","Faveeo","Favicon downloader","faviconkit","FavOrg","Feed Wrangler","Feedable\\\/","Feedbin","FeedBooster","FeedBucket","FeedBunch\\\/[0-9]","FeedBurner","FeedChecker","Feedly","Feedreader","FeedshowOnline","Feedspot","Feedwind\\\/[0-9]","FeedZcollector","feeltiptop","Fetch API","Fetch\\\/[0-9]","Fever\\\/[0-9]","FHscan","Fimap","findlink","findthatfile","FlashGet","FlipboardBrowserProxy","FlipboardProxy","FlipboardRSS","Flock\\\/","fluffy","Flunky","flynxapp","forensiq","FoundSeoTool\\\/[0-9]","http:\\\/\\\/www.neomo.de\\\/","free thumbnails","Freeuploader","FreeWebMonitoring SiteChecker","Funnelback","G-i-g-a-b-o-t","g00g1e\\.net","GAChecker","ganarvisitas\\\/[0-9]","geek-tools","Genderanalyzer","Genieo","GentleSource","GetCode","Getintent","GetLinkInfo","getprismatic\\.com","GetRight","getroot","GetURLInfo\\\/[0-9]","GetWeb","Ghost Inspector","GigablastOpenSource","GIS-LABS","github-camo","github\\.com\\\/","Go [\\d\\.]* package http","Go http package","Go-Ahead-Got-It","Go-http-client","Go!Zilla","gobyus","gofetch","GomezAgent","gooblog","Goodzer\\\/[0-9]","Google AppsViewer","Google Desktop","Google favicon","Google Keyword Suggestion","Google Keyword Tool","Google Page Speed Insights","Google PP Default","Google Search Console","Google Web Preview","google_partner_monitoring","Google-Adwords","Google-Apps-Script","Google-Calendar-Importer","Google-HotelAdsVerifier","Google-HTTP-Java-Client","Google-Publisher-Plugin","Google-SearchByImage","Google-Site-Verification","Google-Structured-Data-Testing-Tool","Google-Youtube-Links","google-xrawler","GoogleCloudMonitoring","GoogleDocs","GoogleHC\\\/","GoogleProducer","GoogleSites","Google-Transparency-Report","Gookey","GoScraper","GoSpotCheck","GoSquared-Status-Checker","gosquared-thumbnailer","Gotit","GoZilla","grabify","GrabNet","Grafula","Grammarly","GrapeFX","Gregarius","GRequests","grokkit","grouphigh","grub-client","gSOAP\\\/","GT::WWW","GTmetrix","GuzzleHttp","gvfs\\\/","HAA(A)?RTLAND http client","Haansoft","hackney\\\/","Hadi Agent","Hatena","Havij","hawkReader","HeadlessChrome","HEADMasterSEO","HeartRails_Capture","help@dataminr\\.com","heritrix","historious\\\/","hkedcity","hledejLevne\\.cz\\\/[0-9]","Hloader","HMView","Holmes","HonesoSearchEngine\\\/","HootSuite Image proxy","Hootsuite-WebFeed\\\/[0-9]","hosterstats","HostTracker","ht:\\\/\\\/check","htdig","HTMLparser","htmlyse\\.com","HTTP Banner Detection","HTTP_Compression_Test","http_request2","http_requester","http-get","HTTP-Header-Abfrage","http-kit","http-request\\\/","HTTP-Tiny","HTTP::Lite","http\\.rb\\\/","http_get","HttpComponents","httphr","HTTPMon","httpRequest","httpscheck","httpssites_power","httpunit","HttpUrlConnection","httrack","huaweisymantec","HubSpot ","Humanlinks","HyperZbozi\\.cz Feeder","i2kconnect\\\/","Iblog","ichiro","Id-search","IdeelaborPlagiaat","IDG Twitter Links Resolver","IDwhois\\\/[0-9]","Iframely","igdeSpyder","IlTrovatore","Image\\ Fetch","Image\\ Sucker","ImageEngine\\\/","ImageVisu\\\/","Imagga","imagineeasy","imgsizer","InAGist","inbound\\.li parser","InDesign%20CC","Indy\\ Library","InetURL","infegy","infohelfer","InfoTekies","InfoWizards Reciprocal Link System PRO","inpwrd\\.com","instabid","Instapaper","Integrity","integromedb","Intelliseek","InterGET","internet_archive","Internet\\ Ninja","InternetSeer","internetVista monitor","intraVnews","IODC","IOI","iplabel","ips-agent","IPS\\\/[0-9]","IPWorks HTTP\\\/S Component","iqdb\\\/","Iria","Irokez","isitup\\.org","iskanie","isUp\\.li","iThemes Sync\\\/[0-9]","iZSearch","JAHHO","janforman","Jaunt\\\/","Jbrofuzz","Jersey\\\/","JetCar","Jigsaw","Jobboerse","JobFeed discovery","Jobg8 URL Monitor","jobo","Jobrapido","Jobsearch1\\.5","JoinVision Generic","JolokiaPwn","Joomla","Jorgee","JS-Kit","JustView","Kaspersky Lab CFR link resolver","KeepRight OpenStreetMap Checker","Kelny\\\/","Kerrigan\\\/","KeyCDN","Keyword Extractor","Keyword\\ Density","Keywords Research","KickFire","KimonoLabs\\\/","Kml-Google","knows\\.is","KOCMOHABT","kouio","kube-probe","kulturarw3","KumKie","L\\.webis","Larbin","Lavf\\\/","LayeredExtractor","LeechFTP","LeechGet","letsencrypt","Lftp","LibVLC","LibWeb","Libwhisker","libwww","Licorne","Liferea\\\/","Lightspeedsystems","Likse","link checker","Link Valet","link_thumbnailer","LinkAlarm\\\/","linkCheck","linkdex","LinkExaminer","linkfluence","linkpeek","LinkPreviewGenerator","LinkScan","LinksManager","LinkTiger","LinkWalker","Lipperhey","Litemage_walker","livedoor ScreenShot","LoadImpactRload","localsearch-web","LongURL API","looksystems\\.net","ltx71","lua-resty-http","lwp-request","lwp-trivial","LWP::Simple","lycos","LYT\\.SR","mabontland","Mag-Net","MagpieRSS","Mail\\.Ru","MailChimp","Majestic12","makecontact\\\/","Mandrill","MapperCmd","marketinggrader","MarkMonitor","MarkWatch","Mass\\ Downloader","masscan\\\/[0-9]","Mata\\ Hari","Mediapartners-Google","mediawords","MegaIndex\\.ru","MeltwaterNews","Melvil Rawi\\\/","MergeFlow-PageReader","Metaspinner","MetaURI","MFC_Tear_Sample","Microsearch","Microsoft Office ","Microsoft Outlook","Microsoft Windows Network Diagnostics","Microsoft-WebDAV-MiniRedir","Microsoft\\ Data\\ Access","MIDown\\ tool","MIIxpc","Mindjet","Miniature\\.io\\\/","Miniflux","Mister\\ PiX","mixdata dot com","mixed-content-scan","Mixmax-LinkPreview","mixnode","Mnogosearch","mogimogi","Mojeek","Mojolicious \\(Perl\\)","Monit\\\/","monitis","Monitority\\\/[0-9]","montastic","MonTools","Moreover","Morfeus\\ Fucking\\ Scanner","Morning Paper","MovableType","mowser","Mrcgiguy","MS\\ Web\\ Services\\ Client\\ Protocol","MSFrontPage","mShots","MuckRack\\\/","muhstik-scan","MVAClient","MxToolbox\\\/","nagios","Najdi\\.si\\\/","Name\\ Intelligence","Nameprotect","Navroad","NearSite","Needle","Nessus","Net\\ Vampire","NetAnts","NETCRAFT","NetLyzer","NetMechanic","NetNewsWire","Netpursual","netresearch","NetShelter ContentScan","Netsparker","NetTrack","Netvibes","NetZIP","Neustar WPM","NeutrinoAPI","NewRelicPinger","NewsBlur .*Finder","NewsGator","newsme","newspaper\\\/","Nexgate Ruby Client","NG-Search","Nibbler","NICErsPRO","Nikto","nineconnections\\.com","NLNZ_IAHarvester","Nmap Scripting Engine","node-superagent","node-urllib\\\/","node\\.io","Nodemeter","NodePing","nominet\\.org\\.uk","Norton-Safeweb","Notifixious","notifyninja","nuhk","nutch","Nuzzel","nWormFeedFinder","Nymesis","NYU","Ocelli\\\/[0-9]","Octopus","oegp","Offline Explorer","Offline\\ Navigator","og-scraper\\\/","okhttp","Omea Reader","omgili","OMSC","Online Domain Tools","OpenCalaisSemanticProxy","Openfind","OpenLinkProfiler","Openstat\\\/","OpenVAS","Optimizer","Orbiter","OrgProbe\\\/[0-9]","orion-semantics","Outlook-Express","ow\\.ly","Owler","ownCloud News","OxfordCloudService\\\/[0-9]","Page Analyzer","Page Valet","page_verifier","page\\ scorer","page2rss","PageAnalyzer","PageGrabber","PagePeeker","PageScorer","Pagespeed\\\/[0-9]","Panopta","panscient","Papa\\ Foto","parsijoo","Pavuk","PayPal IPN","pcBrowser","Pcore-HTTP","Pearltrees","PECL::HTTP","peerindex","Peew","PeoplePal","Perlu -","PhantomJS Screenshoter","PhantomJS\\\/","Photon\\\/","phpservermon","Pi-Monster","Picscout","Picsearch","PictureFinder","Pimonster","ping\\.blo\\.gs\\\/","Pingability","PingAdmin\\.Ru","Pingdom","Pingoscope","PingSpot","pinterest\\.com","Pixray","Pizilla","Plagger\\\/","Ploetz \\+ Zeller","Plukkie","plumanalytics","PocketImageCache","PocketParser","Pockey","POE-Component-Client-HTTP","Pompos","Porkbun","Port Monitor","postano","PostmanRuntime\\\/","PostPost","postrank","PowerPoint\\\/","Priceonomics Analysis Engine","PrintFriendly\\.com","PritTorrent\\\/[0-9]","Prlog","probethenet","Project 25499","Promotion_Tools_www\\.searchenginepromotionhelp\\.com","prospectb2b","Protopage","ProWebWalker","proximic","PRTG Network Monitor","pshtt, https scanning","PTST ","PTST\\\/[0-9]+","Pulsepoint XT3 web scraper","Pump","Python-httplib2","python-requests","Python-urllib","Qirina Hurdler","QQDownload","QrafterPro","Qseero","Qualidator\\.com SiteAnalyzer","QueryN\\ Metasearch","queuedriver","Quora Link Preview","Qwantify","Radian6","RankActive","RankFlex","RankSonicSiteAuditor","Re-re Studio","Readability","RealDownload","RealPlayer%20Downloader","RebelMouse","Recorder","RecurPost\\\/","redback\\\/","Redirect Checker Tool","ReederForMac","ReGet","RepoMonkey","request\\.js","ResponseCodeTest\\\/[0-9]","RestSharp","Riddler","Rival IQ","Robosourcer","Robozilla\\\/[0-9]","ROI Hunter","RPT-HTTPClient","rss reader","RSSOwl","RssReader\\\/","safe-agent-scanner","SalesIntelligent","Saleslift","Sendsay\\.Ru","SauceNAO","SBIder","scalaj-http","scan\\.lol","ScanAlert","Scoop","scooter","ScoutJet","ScoutURLMonitor","ScrapeBox Page Scanner","SimpleScraper","Scrapy","Screaming","ScreenShotService\\\/[0-9]","Scrubby","Scrutiny\\\/","search\\.thunderstone","Search37\\\/","Searchestate","SearchExpress","SearchSight","Seeker","semanticdiscovery","semanticjuice","Semiocast HTTP client","Semrush","sentry\\\/","SEO Browser","Seo Servis","seo-nastroj\\.cz","seo4ajax","Seobility","SEOCentro","SeoCheck","SEOkicks","Seomoz","SEOprofiler","SEOsearch\\\/","SeopultContentAnalyzer","seoscanners","SEOstats","Server Density Service Monitoring","servernfo\\.com","sexsearcher","Seznam","Shelob","Shodan","Shoppimon Analyzer","ShoppimonAgent\\\/[0-9]","ShopWiki","ShortLinkTranslate","shrinktheweb","Sideqik","SilverReader","SimplePie","SimplyFast","Siphon","SISTRIX","Site-Shot\\\/","Site\\ Sucker","Site24x7","SiteBar","Sitebeam","Sitebulb\\\/","SiteCondor","SiteExplorer","SiteGuardian","Siteimprove","SiteIndexed","Sitemap(s)? Generator","SitemapGenerator","SiteMonitor","Siteshooter B0t","SiteSnagger","SiteSucker","SiteTruth","Sitevigil","sitexy\\.com","SkypeUriPreview","Slack\\\/","slider\\.com","slurp","SlySearch","SmartDownload","SMRF URL Expander","SMUrlExpander","Snake","Snappy","SnapSearch","Snarfer\\\/","SniffRSS","sniptracker","Snoopy","SnowHaze Search","sogou web","SortSite","Sottopop","sovereign\\.ai","SpaceBison","SpamExperts","Spammen","Spanner","spaziodati","SPDYCheck","Specificfeeds","speedy","SPEng","Spinn3r","spray-can","Sprinklr ","spyonweb","sqlmap","Sqlworm","Sqworm","SSL Labs","ssl-tools","StackRambler","Statastico\\\/","StatusCake","Steeler","Stratagems Kumo","Stroke\\.cz","StudioFACA","StumbleUpon","suchen","Sucuri","summify","Super Monitoring","SuperHTTP","Surphace Scout","Suzuran","SwiteScraper","Symfony BrowserKit","Symfony2 BrowserKit","SynHttpClient-Built","Sysomos","sysscan","Szukacz","T0PHackTeam","tAkeOut","Tarantula\\\/","Taringa UGC","TarmotGezgin","Teleport","Telesoft","Telesphoreo","Telesphorep","Tenon\\.io","teoma","terrainformatica\\.com","Test Certificate Info","Tetrahedron\\\/[0-9]","The Drop Reaper","The Expert HTML Source Viewer","The Knowledge AI","The\\ Intraformant","theinternetrules","TheNomad","theoldreader\\.com","Thinklab","Thumbshots","ThumbSniper","timewe\\.net","TinEye","Tiny Tiny RSS","TLSProbe\\\/","Toata","topster","touche\\.com","Traackr\\.com","tracemyfile","TrapitAgent","Trendiction","Trendsmap","trendspottr\\.com","truwoGPS","TryJsoup","TulipChain","Turingos","Turnitin","tweetedtimes\\.com","Tweetminster","Tweezler\\\/","twibble","Twice","Twikle","Twingly","Twisted PageGetter","Typhoeus","ubermetrics-technologies","uclassify","UdmSearch","unirest-java","UniversalFeedParser","Unshorten\\.It","Untiny","UnwindFetchor","updated","updown\\.io daemon","Upflow","Uptimia","URL Verifier","URLChecker","URLitor\\.com","urlresolver","Urlstat","UrlTrends Ranking Updater","URLy\\ Warning","URLy\\.Warning","Vacuum","Vagabondo","VB\\ Project","vBSEO","VCI","via ggpht\\.com GoogleImageProxy","VidibleScraper","Virusdie","visionutils","vkShare","VoidEYE","Voil","voltron","voyager\\\/","VSAgent\\\/[0-9]","VSB-TUO\\\/[0-9]","Vulnbusters Meter","VYU2","w3af\\.org","W3C_I18n-Checker","W3C_Unicorn","W3C-checklink","W3C-mobileOK","WAC-OFU","Wallpapers\\\/[0-9]+","WallpapersHD","wangling","Wappalyzer","WatchMouse","WbSrch\\\/","WDT\\.io","web-capture\\.net","Web-Monitoring","Web-sniffer","Web\\ Auto","Web\\ Collage","Web\\ Enhancer","Web\\ Fetch","Web\\ Fuck","Web\\ Pix","Web\\ Sauger","Web\\ Sucker","Webalta","Webauskunft","WebAuto","WebCapture","WebClient\\\/","webcollage","WebCookies","WebCopier","WebCorp","WebDataStats\\\/[0-9]","WebDoc","WebEnhancer","WebFetch","WebFuck","WebGo\\ IS","WebImageCollector","WebImages","WebIndex","webkit2png","WebLeacher","webmastercoffee","webmon ","WebPix","WebReaper","WebSauger","webscreenie","Webshag","Webshot","Website Analyzer\\\/","Website\\ Quester","WebsiteExtractor","websitepulse agent","websitepulse[+ ]checker","WebsiteQuester","Websnapr\\\/","Webster","WebStripper","WebSucker","Webthumb\\\/[0-9]","WebThumbnail","WebWhacker","WebZIP","WeLikeLinks","WEPA","WeSEE","wf84","Wfuzz\\\/","wget","WhatsApp","WhatsMyIP","WhatWeb","WhereGoes\\?","Whibse","WhoRunsCoinHive","Whynder Magnet","Windows-RSS-Platform","WinPodder","wkhtmlto","wmtips","Woko","Word\\\/","WordPress\\\/","WordupinfoSearch","wotbox","WP Engine Install Performance API","wpif","wprecon\\.com survey","WPScan","wscheck","Wtrace","WWW-Collector-E","WWW-Mechanize","WWW::Document","WWW::Mechanize","www\\.monitor\\.us","WWWOFFLE","x09Mozilla","x22Mozilla","XaxisSemanticsClassifier","Xenu Link Sleuth","XING-contenttabreceiver\\\/[0-9]","xpymep([0-9]?)\\.exe","Y!J-(ASR|BSC)","Y\\!J-BRW","Yaanb","yacy","Yahoo Ad monitoring","Yahoo Link Preview","YahooCacheSystem","YahooYSMcm","YandeG","Yandex(?!Search)","yanga","yeti","Yo-yo","Yoleo Consumer","yoogliFetchAgent","YottaaMonitor","Your-Website-Sucks\\\/[0-9]","yourls\\.org","YoYs\\.net","YP\\.PL","Zabbix","Zade","Zao","Zauba","Zemanta Aggregator","Zend_Http_Client","Zend\\\\Http\\\\Client","Zermelo","Zeus","zgrab","ZnajdzFoto","Zombie\\.js","Zoom\\.Mac","ZyBorg","[a-z0-9\\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron)"] \ No newline at end of file +[".*Java.*outbrain"," YLT","^b0t$","^bluefish ","^Calypso v\\\/","^COMODO DCV","^DangDang","^DavClnt","^FDM ","^git\\\/","^Goose\\\/","^Grabber","^HTTPClient\\\/","^Java\\\/","^Jeode\\\/","^Jetty\\\/","^Mail\\\/","^Mget","^Microsoft URL Control","^NG\\\/[0-9\\.]","^NING\\\/","^PHP\\\/[0-9]","^RMA\\\/","^Ruby|Ruby\\\/[0-9]","^VSE\\\/[0-9]","^WordPress\\.com","^XRL\\\/[0-9]","^ZmEu","008\\\/","13TABS","192\\.comAgent","2ip\\.ru","404enemy","7Siters","80legs","a\\.pr-cy\\.ru","a3logics\\.in","A6-Indexer","Abonti","Aboundex","aboutthedomain","Accoona-AI-Agent","acoon","acrylicapps\\.com\\\/pulp","Acunetix","AdAuth\\\/","adbeat","AddThis","ADmantX","adressendeutschland","adscanner","agentslug","AHC","aihit","aiohttp\\\/","Airmail","akka-http\\\/","akula\\\/","alertra","alexa site audit","Alibaba\\.Security\\.Heimdall","Alligator","allloadin","AllSubmitter","alyze\\.info","amagit","Anarchie","AndroidDownloadManager","Anemone","AngleSharp","annotate_google","Ant\\.com","Anturis Agent","AnyEvent-HTTP\\\/","Apache Droid","Apache OpenOffice","Apache-HttpAsyncClient","Apache-HttpClient","ApacheBench","Apexoo","APIs-Google","AportWorm\\\/","AppBeat\\\/","AppEngine-Google","AppStoreScraperZ","Aprc\\\/[0-9]","Arachmo","arachnode","Arachnophilia","aria2","Arukereso","asafaweb","AskQuickly","Ask Jeeves","ASPSeek","Asterias","Astute","asynchttp","Attach","autocite","Autonomy","axios\\\/","B-l-i-t-z-B-O-T","Backlink-Ceck","backlink-check","BacklinkHttpStatus","BackStreet","BackWeb","Bad-Neighborhood","Badass","baidu\\.com","Bandit","basicstate","BatchFTP","Battleztar Bazinga","baypup\\\/","BazQux","BBBike","BCKLINKS","BDFetch","BegunAdvertising","Bidtellect","BigBozz","Bigfoot","biglotron","BingLocalSearch","BingPreview","binlar","biNu image cacher","Bitacle","biz_Directory","Black Hole","Blackboard Safeassign","BlackWidow","BlockNote\\.Net","Bloglines","Bloglovin","BlogPulseLive","BlogSearch","Blogtrottr","BlowFish","boitho\\.com-dc","BPImageWalker","Braintree-Webhooks","Branch Metrics API","Branch-Passthrough","Brandprotect","BrandVerity","Brandwatch","Brodie\\\/","Browsershots","BUbiNG","Buck\\\/","Buddy","BuiltWith","Bullseye","BunnySlippers","Burf Search","Butterfly\\\/","BuzzSumo","CAAM\\\/[0-9]","CakePHP","Calculon","Canary%20Mail","CaretNail","catexplorador","CC Metadata Scaper","Cegbfeieh","censys","Cerberian Drtrs","CERT\\.at-Statistics-Survey","cg-eye","changedetection","ChangesMeter","Charlotte","CheckHost","checkprivacy","CherryPicker","ChinaClaw","Chirp\\\/","chkme\\.com","Chlooe","Chromaxa","CirrusExplorer","CISPA Vulnerability Notification","Citoid","CJNetworkQuality","Clarsentia","clips\\.ua\\.ac\\.be","Cloud mapping","CloudEndure","CloudFlare-AlwaysOnline","Cloudinary","cmcm\\.com","coccoc","cognitiveseo","colly -","CommaFeed","Commons-HttpClient","contactbigdatafr","contentkingapp","convera","CookieReports","copyright sheriff","CopyRightCheck","Copyscape","Cosmos4j\\.feedback","Covario-IDS","Crescent","Crowsnest","Criteo","CSHttp","curb","Curious George","curl","cuwhois\\\/","cybo\\.com","DAP\\\/NetHTTP","DareBoost","DatabaseDriverMysqli","DataCha0s","Datafeedwatch","Datanyze","DataparkSearch","dataprovider","DataXu","Daum(oa)?[ \\\/][0-9]","Demon","DeuSu","developers\\.google\\.com\\\/\\+\\\/web\\\/snippet\\\/","Devil","Digg","Digincore","DigitalPebble","Dirbuster","Disqus\\\/","Dispatch\\\/","DittoSpyder","dlvr","DMBrowser","DNSPod-reporting","docoloc","Dolphin http client","DomainAppender","Donuts Content Explorer","dotMailer content retrieval","dotSemantic","downforeveryoneorjustme","Download Wonder","downnotifier","DowntimeDetector","Drip","drupact","Drupal \\(\\+http:\\\/\\\/drupal\\.org\\\/\\)","DTS Agent","dubaiindex","EARTHCOM","Easy-Thumb","EasyDL","Ebingbong","ec2linkfinder","eCairn-Grabber","eCatch","ECCP","eContext\\\/","Ecxi","EirGrabber","ElectricMonk","elefent","EMail Exractor","EMail Wolf","EmailWolf","Embarcadero","Embed PHP Library","Embedly","endo\\\/","europarchive\\.org","evc-batch","EventMachine HttpClient","Everwall Link Expander","Evidon","Evrinid","ExactSearch","ExaleadCloudview","Excel\\\/","exif","Exploratodo","Express WebPictures","Extreme Picture Finder","EyeNetIE","ezooms","facebookexternalhit","facebookplatform","fairshare","Faraday v","fasthttp","Faveeo","Favicon downloader","faviconkit","FavOrg","Feed Wrangler","Feedable\\\/","Feedbin","FeedBooster","FeedBucket","FeedBunch\\\/","FeedBurner","Feedly","FeedshowOnline","Feedspot","Feedwind\\\/","FeedZcollector","feeltiptop","Fetch API","Fetch\\\/[0-9]","Fever\\\/[0-9]","FHscan","Fimap","findlink","findthatfile","FlashGet","FlipboardBrowserProxy","FlipboardProxy","FlipboardRSS","Flock\\\/","fluffy","Flunky","flynxapp","forensiq","FoundSeoTool","http:\\\/\\\/www.neomo.de\\\/","free thumbnails","Freeuploader","Funnelback","G-i-g-a-b-o-t","g00g1e\\.net","ganarvisitas","geek-tools","Genieo","GentleSource","GetCode","Getintent","GetLinkInfo","getprismatic","GetRight","getroot","GetURLInfo\\\/","GetWeb","Ghost Inspector","GigablastOpenSource","GIS-LABS","github-camo","github\\.com","Go [\\d\\.]* package http","Go http package","Go-Ahead-Got-It","Go-http-client","Go!Zilla","gobyus","gofetch","GomezAgent","gooblog","Goodzer\\\/","Google AppsViewer","Google Desktop","Google favicon","Google Keyword Suggestion","Google Keyword Tool","Google Page Speed Insights","Google PP Default","Google Search Console","Google Web Preview","Google-Adwords","Google-Apps-Script","Google-Calendar-Importer","Google-HotelAdsVerifier","Google-HTTP-Java-Client","Google-Publisher-Plugin","Google-SearchByImage","Google-Site-Verification","Google-Structured-Data-Testing-Tool","Google-Youtube-Links","google-xrawler","GoogleDocs","GoogleHC\\\/","GoogleProducer","GoogleSites","Google-Transparency-Report","Gookey","GoScraper","GoSpotCheck","gosquared-thumbnailer","Gotit","GoZilla","grabify","GrabNet","Grafula","Grammarly","GrapeFX","Gregarius","GRequests","grokkit","grouphigh","grub-client","gSOAP\\\/","GT::WWW","GTmetrix","GuzzleHttp","gvfs\\\/","HAA(A)?RTLAND http client","Haansoft","hackney\\\/","Hadi Agent","Hatena","Havij","HeadlessChrome","HEADMasterSEO","HeartRails_Capture","help@dataminr\\.com","heritrix","historious","hkedcity","hledejLevne\\.cz","Hloader","HMView","Holmes","HonesoSearchEngine","HootSuite Image proxy","Hootsuite-WebFeed","hosterstats","HostTracker","ht:\\\/\\\/check","htdig","HTMLparser","htmlyse","HTTP Banner Detection","HTTP_Compression_Test","http_request2","http_requester","http-get","HTTP-Header-Abfrage","http-kit","http-request\\\/","HTTP-Tiny","HTTP::Lite","http\\.rb\\\/","http_get","HttpComponents","httphr","HTTPMon","httpRequest","httpscheck","httpssites_power","httpunit","HttpUrlConnection","httrack","huaweisymantec","HubSpot ","Humanlinks","HyperZbozi\\.cz Feeder","i2kconnect\\\/","Iblog","ichiro","Id-search","IdeelaborPlagiaat","IDG Twitter Links Resolver","IDwhois\\\/","Iframely","igdeSpyder","IlTrovatore","Image Fetch","Image Sucker","ImageEngine\\\/","ImageVisu\\\/","Imagga","imagineeasy","imgsizer","InAGist","inbound\\.li parser","InDesign%20CC","Indy Library","InetURL","infegy","infohelfer","InfoTekies","InfoWizards Reciprocal Link","inpwrd\\.com","instabid","Instapaper","Integrity","integromedb","Intelliseek","InterGET","internet_archive","Internet Ninja","InternetSeer","internetVista monitor","intraVnews","IODC","IOI","iplabel","ips-agent","IPS\\\/[0-9]","IPWorks HTTP\\\/S Component","iqdb\\\/","Iria","Irokez","isitup\\.org","iskanie","isUp\\.li","iThemes Sync\\\/","iZSearch","JAHHO","janforman","Jaunt\\\/","Jbrofuzz","Jersey\\\/","JetCar","Jigsaw","Jobboerse","JobFeed discovery","Jobg8 URL Monitor","jobo","Jobrapido","Jobsearch1\\.5","JoinVision Generic","JolokiaPwn","Joomla","Jorgee","JS-Kit","JustView","Kaspersky Lab CFR link resolver","Kelny\\\/","Kerrigan\\\/","KeyCDN","Keyword Density","Keywords Research","KickFire","KimonoLabs\\\/","Kml-Google","knows\\.is","KOCMOHABT","kouio","kube-probe","kulturarw3","KumKie","L\\.webis","Larbin","Lavf\\\/","LeechFTP","LeechGet","letsencrypt","Lftp","LibVLC","LibWeb","Libwhisker","libwww","Licorne","Liferea\\\/","Lightspeedsystems","Likse","Link Valet","link_thumbnailer","LinkAlarm\\\/","linkCheck","linkdex","LinkExaminer","linkfluence","linkpeek","LinkPreviewGenerator","LinkScan","LinksManager","LinkTiger","LinkWalker","Lipperhey","Litemage_walker","livedoor ScreenShot","LoadImpactRload","localsearch-web","LongURL API","looksystems\\.net","ltx71","lua-resty-http","lwp-request","lwp-trivial","LWP::Simple","lycos","LYT\\.SR","mabontland","Mag-Net","MagpieRSS","Mail\\.Ru","MailChimp","Majestic12","makecontact\\\/","Mandrill","MapperCmd","marketinggrader","MarkMonitor","MarkWatch","Mass Downloader","masscan\\\/","Mata Hari","Mediapartners-Google","mediawords","MegaIndex\\.ru","MeltwaterNews","Melvil Rawi","Metaspinner","MetaURI","MFC_Tear_Sample","Microsearch","Microsoft Office ","Microsoft Outlook","Microsoft Windows Network Diagnostics","Microsoft-WebDAV-MiniRedir","Microsoft Data Access","MIDown tool","MIIxpc","Mindjet","Miniature\\.io","Miniflux","Mister PiX","mixdata dot com","mixed-content-scan","Mixmax-LinkPreview","mixnode","Mnogosearch","mogimogi","Mojeek","Mojolicious \\(Perl\\)","Monit\\\/","monitis","Monitority\\\/","montastic","MonTools","Moreover","Morfeus Fucking Scanner","Morning Paper","MovableType","mowser","Mrcgiguy","MS Web Services Client Protocol","MSFrontPage","mShots","MuckRack\\\/","muhstik-scan","MVAClient","MxToolbox\\\/","nagios","Najdi\\.si","Name Intelligence","Nameprotect","Navroad","NearSite","Needle","Nessus","Net Vampire","NetAnts","NETCRAFT","NetLyzer","NetMechanic","NetNewsWire","Netpursual","netresearch","NetShelter ContentScan","Netsparker","NetTrack","Netvibes","NetZIP","Neustar WPM","NeutrinoAPI","NewRelicPinger","NewsBlur .*Finder","NewsGator","newsme","newspaper\\\/","Nexgate Ruby Client","NG-Search","Nibbler","NICErsPRO","Nikto","nineconnections","NLNZ_IAHarvester","Nmap Scripting Engine","node-superagent","node-urllib","node\\.io","Nodemeter","NodePing","nominet\\.org\\.uk","Norton-Safeweb","Notifixious","notifyninja","nuhk","nutch","Nuzzel","nWormFeedFinder","Nymesis","NYU","Ocelli\\\/","Octopus","oegp","Offline Explorer","Offline Navigator","og-scraper","okhttp","omgili","OMSC","Online Domain Tools","OpenCalaisSemanticProxy","Openfind","OpenLinkProfiler","Openstat\\\/","OpenVAS","Optimizer","Orbiter","OrgProbe\\\/","orion-semantics","Outlook-Express","ow\\.ly","Owler","ownCloud News","OxfordCloudService","Page Valet","page_verifier","page scorer","page2rss","PageGrabber","PagePeeker","PageScorer","Pagespeed\\\/","Panopta","panscient","Papa Foto","parsijoo","Pavuk","PayPal IPN","pcBrowser","Pcore-HTTP","Pearltrees","PECL::HTTP","peerindex","Peew","PeoplePal","Perlu -","PhantomJS Screenshoter","PhantomJS\\\/","Photon\\\/","phpservermon","Pi-Monster","Picscout","Picsearch","PictureFinder","Pimonster","ping\\.blo\\.gs","Pingability","PingAdmin\\.Ru","Pingdom","Pingoscope","PingSpot","pinterest\\.com","Pixray","Pizilla","Plagger\\\/","Ploetz \\+ Zeller","Plukkie","plumanalytics","PocketImageCache","PocketParser","Pockey","POE-Component-Client-HTTP","Pompos","Porkbun","Port Monitor","postano","PostmanRuntime","PostPost","postrank","PowerPoint\\\/","Priceonomics Analysis Engine","PrintFriendly","PritTorrent","Prlog","probethenet","Project 25499","prospectb2b","Protopage","ProWebWalker","proximic","PRTG Network Monitor","pshtt, https scanning","PTST ","PTST\\\/[0-9]+","Pulsepoint XT3 web scraper","Pump","Python-httplib2","python-requests","Python-urllib","Qirina Hurdler","QQDownload","QrafterPro","Qseero","Qualidator","QueryN Metasearch","queuedriver","Quora Link Preview","Qwantify","Radian6","RankActive","RankFlex","RankSonicSiteAuditor","Re-re Studio","Readability","RealDownload","RealPlayer%20Downloader","RebelMouse","Recorder","RecurPost\\\/","redback\\\/","ReederForMac","ReGet","RepoMonkey","request\\.js","ResponseCodeTest","RestSharp","Riddler","Rival IQ","Robosourcer","Robozilla","ROI Hunter","RPT-HTTPClient","RSSOwl","safe-agent-scanner","SalesIntelligent","Saleslift","Sendsay\\.Ru","SauceNAO","SBIder","scalaj-http","scan\\.lol","ScanAlert","Scoop","scooter","ScoutJet","ScoutURLMonitor","ScrapeBox Page Scanner","SimpleScraper","Scrapy","Screaming","ScreenShotService","Scrubby","Scrutiny\\\/","search\\.thunderstone","Search37","searchenginepromotionhelp","Searchestate","SearchExpress","SearchSight","Seeker","semanticdiscovery","semanticjuice","Semiocast HTTP client","Semrush","sentry\\\/","SEO Browser","Seo Servis","seo-nastroj\\.cz","seo4ajax","Seobility","SEOCentro","SeoCheck","SEOkicks","Seomoz","SEOprofiler","SEOsearch","seoscanners","SEOstats","servernfo","sexsearcher","Seznam","Shelob","Shodan","Shoppimon","ShopWiki","ShortLinkTranslate","shrinktheweb","Sideqik","SimplePie","SimplyFast","Siphon","SISTRIX","Site-Shot\\\/","Site Sucker","Site24x7","SiteBar","Sitebeam","Sitebulb\\\/","SiteCondor","SiteExplorer","SiteGuardian","Siteimprove","SiteIndexed","Sitemap(s)? Generator","SitemapGenerator","SiteMonitor","Siteshooter B0t","SiteSnagger","SiteSucker","SiteTruth","Sitevigil","sitexy\\.com","SkypeUriPreview","Slack\\\/","slider\\.com","slurp","SlySearch","SmartDownload","SMRF URL Expander","SMUrlExpander","Snake","Snappy","SnapSearch","Snarfer\\\/","SniffRSS","sniptracker","Snoopy","SnowHaze Search","sogou web","SortSite","Sottopop","sovereign\\.ai","SpaceBison","SpamExperts","Spammen","Spanner","spaziodati","SPDYCheck","Specificfeeds","speedy","SPEng","Spinn3r","spray-can","Sprinklr ","spyonweb","sqlmap","Sqlworm","Sqworm","SSL Labs","ssl-tools","StackRambler","Statastico\\\/","StatusCake","Steeler","Stratagems Kumo","Stroke\\.cz","StudioFACA","StumbleUpon","suchen","Sucuri","summify","SuperHTTP","Surphace Scout","Suzuran","SwiteScraper","Symfony BrowserKit","Symfony2 BrowserKit","SynHttpClient-Built","Sysomos","sysscan","Szukacz","T0PHackTeam","tAkeOut","Tarantula\\\/","Taringa UGC","TarmotGezgin","Teleport","Telesoft","Telesphoreo","Telesphorep","Tenon\\.io","teoma","terrainformatica","Test Certificate Info","Tetrahedron","The Drop Reaper","The Expert HTML Source Viewer","The Knowledge AI","The Intraformant","theinternetrules","TheNomad","Thinklab","Thumbshots","ThumbSniper","timewe\\.net","TinEye","Tiny Tiny RSS","TLSProbe\\\/","Toata","topster","touche\\.com","Traackr\\.com","tracemyfile","TrapitAgent","Trendiction","Trendsmap","trendspottr","truwoGPS","TryJsoup","TulipChain","Turingos","Turnitin","tweetedtimes","Tweetminster","Tweezler\\\/","twibble","Twice","Twikle","Twingly","Twisted PageGetter","Typhoeus","ubermetrics-technologies","uclassify","UdmSearch","unirest-java","UniversalFeedParser","Unshorten\\.It","Untiny","UnwindFetchor","updated","updown\\.io daemon","Upflow","Uptimia","URL Verifier","URLitor","urlresolver","Urlstat","UrlTrends Ranking Updater","URLy Warning","URLy\\.Warning","Vacuum","Vagabondo","VB Project","vBSEO","VCI","via ggpht\\.com GoogleImageProxy","VidibleScraper","Virusdie","visionutils","vkShare","VoidEYE","Voil","voltron","voyager\\\/","VSAgent\\\/","VSB-TUO\\\/","Vulnbusters Meter","VYU2","w3af\\.org","W3C_Unicorn","W3C-checklink","W3C-mobileOK","WAC-OFU","Wallpapers\\\/[0-9]+","WallpapersHD","wangling","Wappalyzer","WatchMouse","WbSrch\\\/","WDT\\.io","web-capture\\.net","Web-sniffer","Web Auto","Web Collage","Web Enhancer","Web Fetch","Web Fuck","Web Pix","Web Sauger","Web Sucker","Webalta","Webauskunft","WebAuto","WebCapture","WebClient\\\/","webcollage","WebCookies","WebCopier","WebCorp","WebDataStats","WebDoc","WebEnhancer","WebFetch","WebFuck","WebGo IS","WebImageCollector","WebImages","WebIndex","webkit2png","WebLeacher","webmastercoffee","webmon ","WebPix","WebReaper","WebSauger","webscreenie","Webshag","Webshot","Website Quester","websitepulse agent","WebsiteQuester","Websnapr","Webster","WebStripper","WebSucker","Webthumb\\\/","WebThumbnail","WebWhacker","WebZIP","WeLikeLinks","WEPA","WeSEE","wf84","Wfuzz\\\/","wget","WhatsApp","WhatsMyIP","WhatWeb","WhereGoes\\?","Whibse","WhoRunsCoinHive","Whynder Magnet","Windows-RSS-Platform","WinPodder","wkhtmlto","wmtips","Woko","Word\\\/","WordPress\\\/","WordupinfoSearch","wotbox","WP Engine Install Performance API","wpif","wprecon\\.com survey","WPScan","wscheck","Wtrace","WWW-Collector-E","WWW-Mechanize","WWW::Document","WWW::Mechanize","www\\.monitor\\.us","WWWOFFLE","x09Mozilla","x22Mozilla","XaxisSemanticsClassifier","Xenu Link Sleuth","XING-contenttabreceiver","xpymep([0-9]?)\\.exe","Y!J-(ASR|BSC)","Y\\!J-BRW","Yaanb","yacy","Yahoo Link Preview","YahooCacheSystem","YahooYSMcm","YandeG","Yandex(?!Search)","yanga","yeti","Yo-yo","Yoleo Consumer","yoogliFetchAgent","YottaaMonitor","Your-Website-Sucks","yourls\\.org","YoYs\\.net","YP\\.PL","Zabbix","Zade","Zao","Zauba","Zemanta Aggregator","Zend_Http_Client","Zend\\\\Http\\\\Client","Zermelo","Zeus","zgrab","ZnajdzFoto","Zombie\\.js","Zoom\\.Mac","ZyBorg","[a-z0-9\\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)"] \ No newline at end of file diff --git a/raw/Crawlers.txt b/raw/Crawlers.txt index e2626bf6..f1eb0ef7 100644 --- a/raw/Crawlers.txt +++ b/raw/Crawlers.txt @@ -30,7 +30,6 @@ 13TABS 192\.comAgent 2ip\.ru -404checker 404enemy 7Siters 80legs @@ -49,41 +48,39 @@ adbeat AddThis ADmantX adressendeutschland -adscanner\/ -Advanced Email Extractor v +adscanner agentslug AHC aihit aiohttp\/ Airmail -Akamai_Site_Analyzer akka-http\/ akula\/ alertra alexa site audit Alibaba\.Security\.Heimdall Alligator -allloadin\.com +allloadin AllSubmitter alyze\.info amagit Anarchie AndroidDownloadManager Anemone -AngleSharp\/ +AngleSharp annotate_google Ant\.com Anturis Agent AnyEvent-HTTP\/ Apache Droid Apache OpenOffice -Apache-HttpAsyncClient\/ -Apache-HttpClient\/ -ApacheBench\/ +Apache-HttpAsyncClient +Apache-HttpClient +ApacheBench Apexoo APIs-Google -AportWorm\/[0-9] -AppBeat\/[0-9] +AportWorm\/ +AppBeat\/ AppEngine-Google AppStoreScraperZ Aprc\/[0-9] @@ -92,7 +89,7 @@ arachnode Arachnophilia aria2 Arukereso -asafaweb\.com +asafaweb AskQuickly Ask Jeeves ASPSeek @@ -115,15 +112,14 @@ baidu\.com Bandit basicstate BatchFTP -Battleztar\ Bazinga -baypup\/[0-9] -baypup\/colbert +Battleztar Bazinga +baypup\/ BazQux BBBike BCKLINKS BDFetch -BegunAdvertising\/ -Bidtellect\/ +BegunAdvertising +Bidtellect BigBozz Bigfoot biglotron @@ -133,24 +129,23 @@ binlar biNu image cacher Bitacle biz_Directory -Black\ Hole +Black Hole Blackboard Safeassign BlackWidow BlockNote\.Net -Bloglines\/ +Bloglines Bloglovin BlogPulseLive BlogSearch Blogtrottr BlowFish -Boardreader boitho\.com-dc BPImageWalker Braintree-Webhooks Branch Metrics API Branch-Passthrough Brandprotect -BrandVerity\/[0-9] +BrandVerity Brandwatch Brodie\/ Browsershots @@ -167,7 +162,6 @@ CAAM\/[0-9] CakePHP Calculon Canary%20Mail -CapsuleChecker CaretNail catexplorador CC Metadata Scaper @@ -177,37 +171,36 @@ Cerberian Drtrs CERT\.at-Statistics-Survey cg-eye changedetection -ChangesMeter\/ +ChangesMeter Charlotte CheckHost checkprivacy CherryPicker ChinaClaw -Chirp\/[0-9] +Chirp\/ chkme\.com Chlooe Chromaxa -CirrusExplorer\/ +CirrusExplorer CISPA Vulnerability Notification Citoid CJNetworkQuality Clarsentia clips\.ua\.ac\.be -Cloud\ mapping +Cloud mapping CloudEndure CloudFlare-AlwaysOnline -Cloudinary\/[0-9] +Cloudinary cmcm\.com coccoc cognitiveseo colly - CommaFeed Commons-HttpClient -Comodo SSL Checker contactbigdatafr contentkingapp convera -CookieReports\.com +CookieReports copyright sheriff CopyRightCheck Copyscape @@ -220,7 +213,7 @@ CSHttp curb Curious George curl -cuwhois\/[0-9] +cuwhois\/ cybo\.com DAP\/NetHTTP DareBoost @@ -232,7 +225,6 @@ DataparkSearch dataprovider DataXu Daum(oa)?[ \/][0-9] -DemandbasePublisherAnalyzer\/ Demon DeuSu developers\.google\.com\/\+\/web\/snippet\/ @@ -246,23 +238,21 @@ Dispatch\/ DittoSpyder dlvr DMBrowser -DNS-Tools Header-Analyzer DNSPod-reporting docoloc -Dolphin http client\/ +Dolphin http client DomainAppender Donuts Content Explorer dotMailer content retrieval dotSemantic downforeveryoneorjustme -Download\ Wonder -downnotifier\.com +Download Wonder +downnotifier DowntimeDetector -Dragonfly File Reader Drip drupact Drupal \(\+http:\/\/drupal\.org\/\) -DTS\ Agent +DTS Agent dubaiindex EARTHCOM Easy-Thumb @@ -278,8 +268,7 @@ EirGrabber ElectricMonk elefent EMail Exractor -EMail\ Wolf -Email%20Extractor +EMail Wolf EmailWolf Embarcadero Embed PHP Library @@ -297,8 +286,7 @@ Excel\/ exif Exploratodo Express WebPictures -ExtractorPro -Extreme\ Picture\ Finder +Extreme Picture Finder EyeNetIE ezooms facebookexternalhit @@ -315,14 +303,12 @@ Feedable\/ Feedbin FeedBooster FeedBucket -FeedBunch\/[0-9] +FeedBunch\/ FeedBurner -FeedChecker Feedly -Feedreader FeedshowOnline Feedspot -Feedwind\/[0-9] +Feedwind\/ FeedZcollector feeltiptop Fetch API @@ -341,33 +327,30 @@ fluffy Flunky flynxapp forensiq -FoundSeoTool\/[0-9] +FoundSeoTool http:\/\/www.neomo.de\/ free thumbnails Freeuploader -FreeWebMonitoring SiteChecker Funnelback G-i-g-a-b-o-t g00g1e\.net -GAChecker -ganarvisitas\/[0-9] +ganarvisitas geek-tools -Genderanalyzer Genieo GentleSource GetCode Getintent GetLinkInfo -getprismatic\.com +getprismatic GetRight getroot -GetURLInfo\/[0-9] +GetURLInfo\/ GetWeb Ghost Inspector GigablastOpenSource GIS-LABS github-camo -github\.com\/ +github\.com Go [\d\.]* package http Go http package Go-Ahead-Got-It @@ -377,7 +360,7 @@ gobyus gofetch GomezAgent gooblog -Goodzer\/[0-9] +Goodzer\/ Google AppsViewer Google Desktop Google favicon @@ -387,7 +370,6 @@ Google Page Speed Insights Google PP Default Google Search Console Google Web Preview -google_partner_monitoring Google-Adwords Google-Apps-Script Google-Calendar-Importer @@ -399,7 +381,6 @@ Google-Site-Verification Google-Structured-Data-Testing-Tool Google-Youtube-Links google-xrawler -GoogleCloudMonitoring GoogleDocs GoogleHC\/ GoogleProducer @@ -408,7 +389,6 @@ Google-Transparency-Report Gookey GoScraper GoSpotCheck -GoSquared-Status-Checker gosquared-thumbnailer Gotit GoZilla @@ -433,27 +413,26 @@ hackney\/ Hadi Agent Hatena Havij -hawkReader HeadlessChrome HEADMasterSEO HeartRails_Capture help@dataminr\.com heritrix -historious\/ +historious hkedcity -hledejLevne\.cz\/[0-9] +hledejLevne\.cz Hloader HMView Holmes -HonesoSearchEngine\/ +HonesoSearchEngine HootSuite Image proxy -Hootsuite-WebFeed\/[0-9] +Hootsuite-WebFeed hosterstats HostTracker ht:\/\/check htdig HTMLparser -htmlyse\.com +htmlyse HTTP Banner Detection HTTP_Compression_Test http_request2 @@ -485,12 +464,12 @@ ichiro Id-search IdeelaborPlagiaat IDG Twitter Links Resolver -IDwhois\/[0-9] +IDwhois\/ Iframely igdeSpyder IlTrovatore -Image\ Fetch -Image\ Sucker +Image Fetch +Image Sucker ImageEngine\/ ImageVisu\/ Imagga @@ -499,12 +478,12 @@ imgsizer InAGist inbound\.li parser InDesign%20CC -Indy\ Library +Indy Library InetURL infegy infohelfer InfoTekies -InfoWizards Reciprocal Link System PRO +InfoWizards Reciprocal Link inpwrd\.com instabid Instapaper @@ -513,7 +492,7 @@ integromedb Intelliseek InterGET internet_archive -Internet\ Ninja +Internet Ninja InternetSeer internetVista monitor intraVnews @@ -529,7 +508,7 @@ Irokez isitup\.org iskanie isUp\.li -iThemes Sync\/[0-9] +iThemes Sync\/ iZSearch JAHHO janforman @@ -551,12 +530,10 @@ Jorgee JS-Kit JustView Kaspersky Lab CFR link resolver -KeepRight OpenStreetMap Checker Kelny\/ Kerrigan\/ KeyCDN -Keyword Extractor -Keyword\ Density +Keyword Density Keywords Research KickFire KimonoLabs\/ @@ -570,7 +547,6 @@ KumKie L\.webis Larbin Lavf\/ -LayeredExtractor LeechFTP LeechGet letsencrypt @@ -583,7 +559,6 @@ Licorne Liferea\/ Lightspeedsystems Likse -link checker Link Valet link_thumbnailer LinkAlarm\/ @@ -623,15 +598,14 @@ MapperCmd marketinggrader MarkMonitor MarkWatch -Mass\ Downloader -masscan\/[0-9] -Mata\ Hari +Mass Downloader +masscan\/ +Mata Hari Mediapartners-Google mediawords MegaIndex\.ru MeltwaterNews -Melvil Rawi\/ -MergeFlow-PageReader +Melvil Rawi Metaspinner MetaURI MFC_Tear_Sample @@ -640,13 +614,13 @@ Microsoft Office Microsoft Outlook Microsoft Windows Network Diagnostics Microsoft-WebDAV-MiniRedir -Microsoft\ Data\ Access -MIDown\ tool +Microsoft Data Access +MIDown tool MIIxpc Mindjet -Miniature\.io\/ +Miniature\.io Miniflux -Mister\ PiX +Mister PiX mixdata dot com mixed-content-scan Mixmax-LinkPreview @@ -657,16 +631,16 @@ Mojeek Mojolicious \(Perl\) Monit\/ monitis -Monitority\/[0-9] +Monitority\/ montastic MonTools Moreover -Morfeus\ Fucking\ Scanner +Morfeus Fucking Scanner Morning Paper MovableType mowser Mrcgiguy -MS\ Web\ Services\ Client\ Protocol +MS Web Services Client Protocol MSFrontPage mShots MuckRack\/ @@ -674,14 +648,14 @@ muhstik-scan MVAClient MxToolbox\/ nagios -Najdi\.si\/ -Name\ Intelligence +Najdi\.si +Name Intelligence Nameprotect Navroad NearSite Needle Nessus -Net\ Vampire +Net Vampire NetAnts NETCRAFT NetLyzer @@ -706,11 +680,11 @@ NG-Search Nibbler NICErsPRO Nikto -nineconnections\.com +nineconnections NLNZ_IAHarvester Nmap Scripting Engine node-superagent -node-urllib\/ +node-urllib node\.io Nodemeter NodePing @@ -724,14 +698,13 @@ Nuzzel nWormFeedFinder Nymesis NYU -Ocelli\/[0-9] +Ocelli\/ Octopus oegp Offline Explorer -Offline\ Navigator -og-scraper\/ +Offline Navigator +og-scraper okhttp -Omea Reader omgili OMSC Online Domain Tools @@ -742,26 +715,24 @@ Openstat\/ OpenVAS Optimizer Orbiter -OrgProbe\/[0-9] +OrgProbe\/ orion-semantics Outlook-Express ow\.ly Owler ownCloud News -OxfordCloudService\/[0-9] -Page Analyzer +OxfordCloudService Page Valet page_verifier -page\ scorer +page scorer page2rss -PageAnalyzer PageGrabber PagePeeker PageScorer -Pagespeed\/[0-9] +Pagespeed\/ Panopta panscient -Papa\ Foto +Papa Foto parsijoo Pavuk PayPal IPN @@ -782,7 +753,7 @@ Picscout Picsearch PictureFinder Pimonster -ping\.blo\.gs\/ +ping\.blo\.gs Pingability PingAdmin\.Ru Pingdom @@ -803,17 +774,16 @@ Pompos Porkbun Port Monitor postano -PostmanRuntime\/ +PostmanRuntime PostPost postrank PowerPoint\/ Priceonomics Analysis Engine -PrintFriendly\.com -PritTorrent\/[0-9] +PrintFriendly +PritTorrent Prlog probethenet Project 25499 -Promotion_Tools_www\.searchenginepromotionhelp\.com prospectb2b Protopage ProWebWalker @@ -831,8 +801,8 @@ Qirina Hurdler QQDownload QrafterPro Qseero -Qualidator\.com SiteAnalyzer -QueryN\ Metasearch +Qualidator +QueryN Metasearch queuedriver Quora Link Preview Qwantify @@ -848,22 +818,19 @@ RebelMouse Recorder RecurPost\/ redback\/ -Redirect Checker Tool ReederForMac ReGet RepoMonkey request\.js -ResponseCodeTest\/[0-9] +ResponseCodeTest RestSharp Riddler Rival IQ Robosourcer -Robozilla\/[0-9] +Robozilla ROI Hunter RPT-HTTPClient -rss reader RSSOwl -RssReader\/ safe-agent-scanner SalesIntelligent Saleslift @@ -881,11 +848,12 @@ ScrapeBox Page Scanner SimpleScraper Scrapy Screaming -ScreenShotService\/[0-9] +ScreenShotService Scrubby Scrutiny\/ search\.thunderstone -Search37\/ +Search37 +searchenginepromotionhelp Searchestate SearchExpress SearchSight @@ -905,29 +873,25 @@ SeoCheck SEOkicks Seomoz SEOprofiler -SEOsearch\/ -SeopultContentAnalyzer +SEOsearch seoscanners SEOstats -Server Density Service Monitoring -servernfo\.com +servernfo sexsearcher Seznam Shelob Shodan -Shoppimon Analyzer -ShoppimonAgent\/[0-9] +Shoppimon ShopWiki ShortLinkTranslate shrinktheweb Sideqik -SilverReader SimplePie SimplyFast Siphon SISTRIX Site-Shot\/ -Site\ Sucker +Site Sucker Site24x7 SiteBar Sitebeam @@ -995,7 +959,6 @@ StumbleUpon suchen Sucuri summify -Super Monitoring SuperHTTP Surphace Scout Suzuran @@ -1017,16 +980,15 @@ Telesphoreo Telesphorep Tenon\.io teoma -terrainformatica\.com +terrainformatica Test Certificate Info -Tetrahedron\/[0-9] +Tetrahedron The Drop Reaper The Expert HTML Source Viewer The Knowledge AI -The\ Intraformant +The Intraformant theinternetrules TheNomad -theoldreader\.com Thinklab Thumbshots ThumbSniper @@ -1042,13 +1004,13 @@ tracemyfile TrapitAgent Trendiction Trendsmap -trendspottr\.com +trendspottr truwoGPS TryJsoup TulipChain Turingos Turnitin -tweetedtimes\.com +tweetedtimes Tweetminster Tweezler\/ twibble @@ -1070,16 +1032,15 @@ updown\.io daemon Upflow Uptimia URL Verifier -URLChecker -URLitor\.com +URLitor urlresolver Urlstat UrlTrends Ranking Updater -URLy\ Warning +URLy Warning URLy\.Warning Vacuum Vagabondo -VB\ Project +VB Project vBSEO VCI via ggpht\.com GoogleImageProxy @@ -1091,12 +1052,11 @@ VoidEYE Voil voltron voyager\/ -VSAgent\/[0-9] -VSB-TUO\/[0-9] +VSAgent\/ +VSB-TUO\/ Vulnbusters Meter VYU2 w3af\.org -W3C_I18n-Checker W3C_Unicorn W3C-checklink W3C-mobileOK @@ -1109,16 +1069,15 @@ WatchMouse WbSrch\/ WDT\.io web-capture\.net -Web-Monitoring Web-sniffer -Web\ Auto -Web\ Collage -Web\ Enhancer -Web\ Fetch -Web\ Fuck -Web\ Pix -Web\ Sauger -Web\ Sucker +Web Auto +Web Collage +Web Enhancer +Web Fetch +Web Fuck +Web Pix +Web Sauger +Web Sucker Webalta Webauskunft WebAuto @@ -1128,12 +1087,12 @@ webcollage WebCookies WebCopier WebCorp -WebDataStats\/[0-9] +WebDataStats WebDoc WebEnhancer WebFetch WebFuck -WebGo\ IS +WebGo IS WebImageCollector WebImages WebIndex @@ -1147,17 +1106,14 @@ WebSauger webscreenie Webshag Webshot -Website Analyzer\/ -Website\ Quester -WebsiteExtractor +Website Quester websitepulse agent -websitepulse[+ ]checker WebsiteQuester -Websnapr\/ +Websnapr Webster WebStripper WebSucker -Webthumb\/[0-9] +Webthumb\/ WebThumbnail WebWhacker WebZIP @@ -1199,13 +1155,12 @@ x09Mozilla x22Mozilla XaxisSemanticsClassifier Xenu Link Sleuth -XING-contenttabreceiver\/[0-9] +XING-contenttabreceiver xpymep([0-9]?)\.exe Y!J-(ASR|BSC) Y\!J-BRW Yaanb yacy -Yahoo Ad monitoring Yahoo Link Preview YahooCacheSystem YahooYSMcm @@ -1217,7 +1172,7 @@ Yo-yo Yoleo Consumer yoogliFetchAgent YottaaMonitor -Your-Website-Sucks\/[0-9] +Your-Website-Sucks yourls\.org YoYs\.net YP\.PL @@ -1235,4 +1190,4 @@ ZnajdzFoto Zombie\.js Zoom\.Mac ZyBorg -[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron) \ No newline at end of file +[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer) \ No newline at end of file