From 5028e17431627585b101f60b857dc790ec0a84aa Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Mon, 6 Nov 2017 13:32:48 -0500 Subject: [PATCH 01/23] complexity --- quantgov/corpora/measure_complexity.py | 108 ++++ quantgov/examples/baseball.txt | 799 +++++++++++++++++++++++++ setup.py | 4 + 3 files changed, 911 insertions(+) create mode 100644 quantgov/corpora/measure_complexity.py create mode 100644 quantgov/examples/baseball.txt diff --git a/quantgov/corpora/measure_complexity.py b/quantgov/corpora/measure_complexity.py new file mode 100644 index 0000000..0f20842 --- /dev/null +++ b/quantgov/corpora/measure_complexity.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +import argparse +import csv +import collections +import concurrent.futures +import io +import math +import logging +import sys +import re + +from nltk.corpus import stopwords + +from textblob import Word +# +from pathlib import Path + +# +ENCODE_IN = 'utf-8' +ENCODE_OUT = 'utf-8' + + +CYCLOMATICS = re.compile( + r'\b(if|but|except|provided|when|where|whenever|unless|notwithstanding' + r'|in\s+the\s+event|in\s+no\s+event)\b' +) + +WORDS = re.compile(r'\b\w+\b') + +LEMMAS = {} +STOPWORDS = set(stopwords.words('english')) + +log = logging.getLogger(Path(__file__).stem) + + +def lemmatize(word): + if word in LEMMAS: + lemma = LEMMAS[word] + else: + lemma = Word(word).lemmatize() + LEMMAS[word] = lemma + return lemma + + +def count_cyclomatics(text): + return len(CYCLOMATICS.findall(' '.join(text.splitlines()))) + + +def get_shannon_entropy(text, words): + lemmas = [ + lemma for lemma in ( + lemmatize(word) for word in words + ) + if lemma not in STOPWORDS + ] + counts = collections.Counter(lemmas) + return round(sum( + -(count / len(lemmas) * math.log(count / len(lemmas), 2)) + for count in counts.values() + ), 2) + + +def get_row_for_file(path): + text = path.read_text(encoding=ENCODE_IN).lower() + file = path.stem + words = WORDS.findall(text) + return ( + file, len(words), len(set(words)), count_cyclomatics(text), + get_shannon_entropy(text, words) + ) + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('indir', type=Path) + parser.add_argument('-o', '--outfile', + type=lambda x: open( + x, 'w', newline='', encoding=ENCODE_OUT), + default=io.TextIOWrapper( + sys.stdout.buffer, encoding=ENCODE_OUT) + ) + verbosity = parser.add_mutually_exclusive_group() + verbosity.add_argument('-v', '--verbose', action='store_const', + const=logging.DEBUG, default=logging.INFO) + verbosity.add_argument('-q', '--quiet', dest='verbose', + action='store_const', const=logging.WARNING) + return parser.parse_args() + + +def main(): + args = parse_args() + logging.basicConfig(level=args.verbose) + writer = csv.writer(args.outfile) + writer.writerow( + ('file', 'words', 'unique words', + 'cyclomatic_complexity', 'shannon_entropy') + ) + with concurrent.futures.ProcessPoolExecutor() as pool: + for file, words, uniques, cyclo, entropy in pool.map( + get_row_for_file, args.indir.iterdir() + ): + log.info(f'finished {file}') + writer.writerow((file, words, uniques, cyclo, entropy)) + + +if __name__ == "__main__": + main() diff --git a/quantgov/examples/baseball.txt b/quantgov/examples/baseball.txt new file mode 100644 index 0000000..de713b7 --- /dev/null +++ b/quantgov/examples/baseball.txt @@ -0,0 +1,799 @@ +The Lahman Baseball Database + +2016 Version +Release Date: February 25, 2017 + + + +README CONTENTS +0.1 Copyright Notice +0.2 Contact Information + +1.0 Release Contents +1.1 Introduction +1.2 What's New +1.3 Acknowledgements +1.4 Using this Database +1.5 Revision History + +2.0 Data Tables + + + +0.1 Copyright Notice & Limited Use License + +This database is copyright 1996-2017 by Sean Lahman. + +This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. For details see: http://creativecommons.org/licenses/by-sa/3.0/ + + +For licensing information or further information, contact Sean Lahman +at: seanlahman@gmail.com + + + +0.2 Contact Information + +Web site: http://www.baseball1.com +E-Mail : seanlahman@gmail.com + +If you're interested in contributing to the maintenance of this +database or making suggestions for improvement, please consider +joining our mailinglist at: + + http://groups.yahoo.com/group/baseball-databank/ + +If you are interested in similar databases for other sports, please +vist the Open Source Sports website at http://OpenSourceSports.com + + +1.0 Release Contents + +This release of the database can be downloaded in several formats. The +contents of each version are listed below. + +MS Access Versions: + lahman2016.mdb + 2016readme.txt + +SQL version + lahman2016.sql + 2016readme.txt + +Comma Delimited Version: + 2016readme.txt + AllStarFull.csv + Appearances.csv + AwardsManagers.csv + AwardsPlayers.csv + AwardsShareManagers.csv + AwardsSharePlayers.csv + Batting.csv + BattingPost.csv + CollegePlaying.csv + Fielding.csv + FieldingOF.csv + FieldingPost.csv + FieldingOFsplit + HallOfFame.csv + HomeGames.csv + Managers.csv + ManagersHalf.csv + Master.csv + Parks.csv + Pitching.csv + PitchingPost.csv + Salaries.csv + Schools.csv + SeriesPost.csv + Teams.csv + TeamsFranchises.csv + TeamsHalf.csv + + +1.1 Introduction + +This database contains pitching, hitting, and fielding statistics for +Major League Baseball from 1871 through 2016. It includes data from +the two current leagues (American and National), the four other "major" +leagues (American Association, Union Association, Players League, and +Federal League), and the National Association of 1871-1875. + +This database was created by Sean Lahman, who pioneered the effort to +make baseball statistics freely available to the general public. What +started as a one man effort in 1994 has grown tremendously, and now a +team of researchers have collected their efforts to make this the +largest and most accurate source for baseball statistics available +anywhere. (See Acknowledgements below for a list of the key +contributors to this project.) + +None of what we have done would have been possible without the +pioneering work of Hy Turkin, S.C. Thompson, David Neft, and Pete +Palmer (among others). All baseball fans owe a debt of gratitude +to the people who have worked so hard to build the tremendous set +of data that we have today. Our thanks also to the many members of +the Society for American Baseball Research who have helped us over +the years. We strongly urge you to support and join their efforts. +Please vist their website (www.sabr.org). + +If you have any problems or find any errors, please let us know. Any +feedback is appreciated + + +1.2 What's New in 2016 + +Player stats have been updated through 2016 season. + +Three new tables have been introduced: FieldingOFsplit, Parks, and HomeGames + +Other notable changes include: +* Improvements to appearances table based on Retrosheet data +* Created a new table FieldingOFsplit, and migrated the LF/CF/RF entries from Fielding to it. +* Deleted all DH entries from Fielding. +* Re-built all regular season batting/pitching/fielding from 2000 forward. +* Filled in NULL values for 1973-1999 in batting for pitchers who did not have a PA due to the DH rule. +* Added Parks and HomeGames to better track major league ballparks, including neutral sites + + +1.3 Acknowledgements + +Much of the raw data contained in this database comes from the work of +Pete Palmer, the legendary statistician, who has had a hand in most +of the baseball encylopedias published since 1974. He is largely +responsible for bringing the batting, pitching, and fielding data out +of the dark ages and into the computer era. Without him, none of this +would be possible. For more on Pete's work, please read his own +account at: http://sabr.org/cmsfiles/PalmerDatabaseHistory.pdf + +Three people have been key contributors to the work that followed, first +by taking the raw data and creating a relational database, and later +by extending the database to make it more accesible to researchers. + +Sean Lahman launched the Baseball Archive's website back before +most people had heard of the world wide web. Frustrated by the +lack of sports data available, he led the effort to build a +baseball database that everyone could use. He created the first version +of the database and began to make it available for free download from +his website in 1995. + +The work of Sean Forman to create and maintain an online encyclopedia +at Baseball-Reference.com was a quantum leap for both fans and researchers. +The website launched in 2000, provding a user-friendly interface to the Lahman +Baseball Database. Forman and Lahman launched the Baseball Databank in 2001, +a group of researchers whose goal was to update and maintain the database +as an open source collection available to all. + +Ted Turocy has done the lion's share of the work to updating the main +data tables since 2012, automating the work of annual updates and linking +historical data to play-by-play accounts compiled by Retrosheet. + +A handful of researchers have made substantial contributions to +maintain this database over years. Listed alphabetically, they +are: Derek Adair, Mike Crain, Kevin Johnson, Rod Nelson, Tom Tango, +and Paul Wendt. These folks did much of the heavy lifting, and are +largely responsible for the improvements made since 2000. + +Others who made important contributions include: Dvd Avins, +Clifford Blau, Bill Burgess, Clem Comly, Jeff Burk, Randy Cox, +Mitch Dickerman, Paul DuBois, Mike Emeigh, F.X. Flinn, Bill Hickman, +Jerry Hoffman, Dan Holmes, Micke Hovmoller, Peter Kreutzer, +Danile Levine, Bruce Macleod, Ken Matinale, Michael Mavrogiannis, +Cliff Otto, Alberto Perdomo, Dave Quinn, John Rickert, Tom Ruane, +Theron Skyles, Hans Van Slooten, Michael Westbay, and Rob Wood. + +Many other people have made significant contributions to the database +over the years. The contribution of Tom Ruane's effort to the overall +quality of the underlying data has been tremendous. His work at +retrosheet.org integrates the yearly data with the day-by-day data, +creating a reference source of startling depth. + +Sean Holtz helped with a major overhaul and redesign before the +2000 season. Keith Woolner was instrumental in helping turn +a huge collection of stats into a relational database in the mid-1990s. +Clifford Otto & Ted Nye also helped provide guidance to the early +versions. Lee Sinnis, John Northey & Erik Greenwood helped supply key +pieces of data. Many others have written in with corrections and +suggestions that made each subsequent version even better than what +preceded it. + +The work of the SABR Baseball Records Committee, led by Lyle Spatz +has been invaluable. So has the work of Bill Carle and the SABR +Biographical Committee. David Vincent, keeper of the Home Run Log and +other bits of hard to find info, has always been helpful. The recent +addition of colleges to player bios is the result of much research by +members of SABR's Collegiate Baseball committee. + +Salary data was first supplied by Doug Pappas, who passed away during +the summer of 2004. He was the leading authority on many subjects, +most significantly the financial history of Major League Baseball. +We are grateful that he allowed us to include some of the data he +compiled. His work has been continued by the SABR Business of +Baseball committee. + +Thanks is also due to the staff at the National Baseball Library +in Cooperstown who have been so helpful over the years, including +Tim Wiles, Jim Gates, Bruce Markusen, and the rest of the staff. + +A special debt of gratitude is owed to Dave Smith and the folks at +Retrosheet. There is no other group working so hard to compile and +share baseball data. Their website (www.retrosheet.org) will give +you a taste of the wealth of information Dave and the gang have collected. + +Thanks to all contributors great and small. What you have created is +a wonderful thing. + + +1.4 Using this Database + +This version of the database is available in Microsoft Access +format, SQL files or in a generic, comma delimited format. Because this is a +relational database, you will not be able to use the data in a +flat-database application. + +Please note that this is not a stand alone application. It requires +a database application or some other application designed specifically +to interact with the database. + +If you are unable to import the data directly, you should download the +database in the delimted text format. Then use the documentation +in section 2.0 of this document to import the data into +your database application. + + +1.5 Revision History + + Version Date Comments + 1.0 December 1992 Database ported from dBase + 1.1 May 1993 Becomes fully relational + 1.2 July 1993 Corrections made to full database + 1.21 December 1993 1993 statistics added + 1.3 July 1994 Pre-1900 data added + 1.31 February 1995 1994 Statistics added + 1.32 August 1995 Statistics added for other leagues + 1.4 September 1995 Fielding Data added + 1.41 November 1995 1995 statistics added + 1.42 March 1996 HOF/All-Star tables added + 1.5-MS October 1996 1st public release - MS Access format + 1.5-GV October 1996 Released generic comma-delimted files + 1.6-MS December 1996 Updated with 1996 stats, some corrections + 1.61-MS December 1996 Corrected error in MASTER table + 1.62 February 1997 Corrected 1914-1915 batters data and updated + 2.0 February 1998 Major Revisions-added teams & managers + 2.1 October 1998 Interim release w/1998 stats + 2.2 January 1999 New release w/post-season stats & awards added + 3.0 November 1999 Major release - fixed errors and 1999 statistics added + 4.0 May 2001 Major release - proofed & redesigned tables + 4.5 March 2002 Updated with 2001 stats and added new biographical data + 5.0 December 2002 Major revision - new tables and data + 5.1 January 2004 Updated with 2003 data, and new pitching categories + 5.2 November 2004 Updated with 2004 season statistics. + 5.3 December 2005 Updated with 2005 season statistics. + 5.4 December 2006 Updated with 2006 season statistics. + 5.5 December 2007 Updated with 2007 season statistics. + 5.6 December 2008 Updated with 2008 season statistics. + 5.7 December 2009 Updated for 2009 and added several tables. + 5.8 December 2010 Updated with 2010 season statistics. + 5.9 December 2011 Updated for 2011 and removed obsolete tables. + 2012 December 2012 Updated with 2012 season statistics + 2013 December 2013 Updated with 2013 season statistics + 2014 December 2014 Updated with 2014 season statistics + 2015 December 2015 Updated with 2015 season statistics + 2016 February 2017 Updated for 201g and added several tables + + +2.0 Data Tables + +The design follows these general principles. Each player is assigned a +unique number (playerID). All of the information relating to that player +is tagged with his playerID. The playerIDs are linked to names and +birthdates in the MASTER table. + +The database is comprised of the following main tables: + + MASTER - Player names, DOB, and biographical info + Batting - batting statistics + Pitching - pitching statistics + Fielding - fielding statistics + +It is supplemented by these tables: + + AllStarFull - All-Star appearances + HallofFame - Hall of Fame voting data + Managers - managerial statistics + Teams - yearly stats and standings + BattingPost - post-season batting statistics + PitchingPost - post-season pitching statistics + TeamFranchises - franchise information + FieldingOF - outfield position data + FieldingPost- post-season fielding data + FieldingOFsplit - LF/CF/RF splits + ManagersHalf - split season data for managers + TeamsHalf - split season data for teams + Salaries - player salary data + SeriesPost - post-season series information + AwardsManagers - awards won by managers + AwardsPlayers - awards won by players + AwardsShareManagers - award voting for manager awards + AwardsSharePlayers - award voting for player awards + Appearances - details on the positions a player appeared at + Schools - list of colleges that players attended + CollegePlaying - list of players and the colleges they attended + Parks - list of major league ballparls + HomeGames - Number of homegames played by each team in each ballpark + + + + +2.1 MASTER table + + +playerID A unique code asssigned to each player. The playerID links + the data in this file with records in the other files. +birthYear Year player was born +birthMonth Month player was born +birthDay Day player was born +birthCountry Country where player was born +birthState State where player was born +birthCity City where player was born +deathYear Year player died +deathMonth Month player died +deathDay Day player died +deathCountry Country where player died +deathState State where player died +deathCity City where player died +nameFirst Player's first name +nameLast Player's last name +nameGiven Player's given name (typically first and middle) +weight Player's weight in pounds +height Player's height in inches +bats Player's batting hand (left, right, or both) +throws Player's throwing hand (left or right) +debut Date that player made first major league appearance +finalGame Date that player made first major league appearance (blank if still active) +retroID ID used by retrosheet +bbrefID ID used by Baseball Reference website + + + +2.2 Batting Table +playerID Player ID code +yearID Year +stint player's stint (order of appearances within a season) +teamID Team +lgID League +G Games +AB At Bats +R Runs +H Hits +2B Doubles +3B Triples +HR Homeruns +RBI Runs Batted In +SB Stolen Bases +CS Caught Stealing +BB Base on Balls +SO Strikeouts +IBB Intentional walks +HBP Hit by pitch +SH Sacrifice hits +SF Sacrifice flies +GIDP Grounded into double plays + + +2.3 Pitching table + +playerID Player ID code +yearID Year +stint player's stint (order of appearances within a season) +teamID Team +lgID League +W Wins +L Losses +G Games +GS Games Started +CG Complete Games +SHO Shutouts +SV Saves +IPOuts Outs Pitched (innings pitched x 3) +H Hits +ER Earned Runs +HR Homeruns +BB Walks +SO Strikeouts +BAOpp Opponent's Batting Average +ERA Earned Run Average +IBB Intentional Walks +WP Wild Pitches +HBP Batters Hit By Pitch +BK Balks +BFP Batters faced by Pitcher +GF Games Finished +R Runs Allowed +SH Sacrifices by opposing batters +SF Sacrifice flies by opposing batters +GIDP Grounded into double plays by opposing batter + +2.4 Fielding Table + +playerID Player ID code +yearID Year +stint player's stint (order of appearances within a season) +teamID Team +lgID League +Pos Position +G Games +GS Games Started +InnOuts Time played in the field expressed as outs +PO Putouts +A Assists +E Errors +DP Double Plays +PB Passed Balls (by catchers) +WP Wild Pitches (by catchers) +SB Opponent Stolen Bases (by catchers) +CS Opponents Caught Stealing (by catchers) +ZR Zone Rating + + +2.5 AllstarFull table + +playerID Player ID code +YearID Year +gameNum Game number (zero if only one All-Star game played that season) +gameID Retrosheet ID for the game idea +teamID Team +lgID League +GP 1 if Played in the game +startingPos If player was game starter, the position played + +2.6 HallOfFame table + +playerID Player ID code +yearID Year of ballot +votedBy Method by which player was voted upon +ballots Total ballots cast in that year +needed Number of votes needed for selection in that year +votes Total votes received +inducted Whether player was inducted by that vote or not (Y or N) +category Category in which candidate was honored +needed_note Explanation of qualifiers for special elections + +2.7 Managers table + +playerID Player ID Number +yearID Year +teamID Team +lgID League +inseason Managerial order. Zero if the individual managed the team + the entire year. Otherwise denotes where the manager appeared + in the managerial order (1 for first manager, 2 for second, etc.) +G Games managed +W Wins +L Losses +rank Team's final position in standings that year +plyrMgr Player Manager (denoted by 'Y') + + +2.8 Teams table + +yearID Year +lgID League +teamID Team +franchID Franchise (links to TeamsFranchise table) +divID Team's division +Rank Position in final standings +G Games played +GHome Games played at home +W Wins +L Losses +DivWin Division Winner (Y or N) +WCWin Wild Card Winner (Y or N) +LgWin League Champion(Y or N) +WSWin World Series Winner (Y or N) +R Runs scored +AB At bats +H Hits by batters +2B Doubles +3B Triples +HR Homeruns by batters +BB Walks by batters +SO Strikeouts by batters +SB Stolen bases +CS Caught stealing +HBP Batters hit by pitch +SF Sacrifice flies +RA Opponents runs scored +ER Earned runs allowed +ERA Earned run average +CG Complete games +SHO Shutouts +SV Saves +IPOuts Outs Pitched (innings pitched x 3) +HA Hits allowed +HRA Homeruns allowed +BBA Walks allowed +SOA Strikeouts by pitchers +E Errors +DP Double Plays +FP Fielding percentage +name Team's full name +park Name of team's home ballpark +attendance Home attendance total +BPF Three-year park factor for batters +PPF Three-year park factor for pitchers +teamIDBR Team ID used by Baseball Reference website +teamIDlahman45 Team ID used in Lahman database version 4.5 +teamIDretro Team ID used by Retrosheet + + +2.9 BattingPost table + +yearID Year +round Level of playoffs +playerID Player ID code +teamID Team +lgID League +G Games +AB At Bats +R Runs +H Hits +2B Doubles +3B Triples +HR Homeruns +RBI Runs Batted In +SB Stolen Bases +CS Caught stealing +BB Base on Balls +SO Strikeouts +IBB Intentional walks +HBP Hit by pitch +SH Sacrifices +SF Sacrifice flies +GIDP Grounded into double plays + + +2.10 PitchingPost table + +playerID Player ID code +yearID Year +round Level of playoffs +teamID Team +lgID League +W Wins +L Losses +G Games +GS Games Started +CG Complete Games +SHO Shutouts +SV Saves +IPOuts Outs Pitched (innings pitched x 3) +H Hits +ER Earned Runs +HR Homeruns +BB Walks +SO Strikeouts +BAOpp Opponents' batting average +ERA Earned Run Average +IBB Intentional Walks +WP Wild Pitches +HBP Batters Hit By Pitch +BK Balks +BFP Batters faced by Pitcher +GF Games Finished +R Runs Allowed +SH Sacrifice Hits allowed +SF Sacrifice Flies allowed +GIDP Grounded into Double Plays + + +2.11 TeamFranchises table + +franchID Franchise ID +franchName Franchise name +active Whetehr team is currently active (Y or N) +NAassoc ID of National Association team franchise played as + + +2.12 FieldingOF table + +playerID Player ID code +yearID Year +stint player's stint (order of appearances within a season) +Glf Games played in left field +Gcf Games played in center field +Grf Games played in right field + + +2.13 ManagersHalf table + +playerID Manager ID code +yearID Year +teamID Team +lgID League +inseason Managerial order. One if the individual managed the team + the entire year. Otherwise denotes where the manager appeared + in the managerial order (1 for first manager, 2 for second, etc.) +half First or second half of season +G Games managed +W Wins +L Losses +rank Team's position in standings for the half + + +2.14 TeamsHalf table + +yearID Year +lgID League +teamID Team +half First or second half of season +divID Division +DivWin Won Division (Y or N) +rank Team's position in standings for the half +G Games played +W Wins +L Losses + + +2.15 Salaries table + +yearID Year +teamID Team +lgID League +playerID Player ID code +salary Salary + + +2.16 SeriesPost table + +yearID Year +round Level of playoffs +teamIDwinner Team ID of the team that won the series +lgIDwinner League ID of the team that won the series +teamIDloser Team ID of the team that lost the series +lgIDloser League ID of the team that lost the series +wins Wins by team that won the series +losses Losses by team that won the series +ties Tie games + +2.17 AwardsManagers table + +playerID Manager ID code +awardID Name of award won +yearID Year +lgID League +tie Award was a tie (Y or N) +notes Notes about the award + + +2.18 AwardsPlayers table + +playerID Player ID code +awardID Name of award won +yearID Year +lgID League +tie Award was a tie (Y or N) +notes Notes about the award + + +2.19 AwardsShareManagers table + +awardID name of award votes were received for +yearID Year +lgID League +playerID Manager ID code +pointsWon Number of points received +pointsMax Maximum numner of points possible +votesFirst Number of first place votes + + +2.20 AwardsSharePlayers table + +awardID name of award votes were received for +yearID Year +lgID League +playerID Player ID code +pointsWon Number of points received +pointsMax Maximum numner of points possible +votesFirst Number of first place votes + + +2.21 FieldingPost table + +playerID Player ID code +yearID Year +teamID Team +lgID League +round Level of playoffs +Pos Position +G Games +GS Games Started +InnOuts Time played in the field expressed as outs +PO Putouts +A Assists +E Errors +DP Double Plays +TP Triple Plays +PB Passed Balls +SB Stolen Bases allowed (by catcher) +CS Caught Stealing (by catcher) + + +2.22 Appearances table + +yearID Year +teamID Team +lgID League +playerID Player ID code +G_all Total games played +GS Games started +G_batting Games in which player batted +G_defense Games in which player appeared on defense +G_p Games as pitcher +G_c Games as catcher +G_1b Games as firstbaseman +G_2b Games as secondbaseman +G_3b Games as thirdbaseman +G_ss Games as shortstop +G_lf Games as leftfielder +G_cf Games as centerfielder +G_rf Games as right fielder +G_of Games as outfielder +G_dh Games as designated hitter +G_ph Games as pinch hitter +G_pr Games as pinch runner + + + +2.23 Schools table +schoolID school ID code +schoolName school name +schoolCity city where school is located +schoolState state where school's city is located +schoolNick nickname for school's baseball team + + + +2.24 CollegePlaying table +playerid Player ID code +schoolID school ID code +year year + + + + +2.25 FieldingOFsplit table +playerID Player ID code +yearID Year +stint player's stint (order of appearances within a season) +teamID Team +lgID League +Pos Position +G Games +GS Games Started +InnOuts Time played in the field expressed as outs +PO Putouts +A Assists +E Errors +DP Double Plays + + + +2.26 Parks table +park.key ballpark ID code +park.name name of ballpark +park.alias alternate names of ballpark +city city +state state +country country + + +2.27 HomeGames table +year.key year +league.key league +team.key team ID +park.key ballpark ID +span.first date of first game played +span.last date of last game played +games total number of games +openings total number of dates played +attendance total attendaance \ No newline at end of file diff --git a/setup.py b/setup.py index b733fe1..345a464 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,10 @@ def find_version(*file_paths): ], extras_require={ 'testing': ['pytest-flake8'] + 'complexity': [ + 'textblob,' + 'nltk' + ] }, entry_points={ 'console_scripts': [ From f540d171471818929145b334ce44e17f43cb90e3 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Mon, 6 Nov 2017 15:49:29 -0500 Subject: [PATCH 02/23] complexity builtins --- quantgov/corpora/builtins.py | 120 ++++ quantgov/corpora/measure_complexity.py | 108 ---- quantgov/examples/baseball.txt | 799 ------------------------- 3 files changed, 120 insertions(+), 907 deletions(-) delete mode 100644 quantgov/corpora/measure_complexity.py delete mode 100644 quantgov/examples/baseball.txt diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 5dbf1d8..f642579 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -6,6 +6,9 @@ import quantgov +from nltk.corpus import stopwords +from textblob import Word + commands = {} @@ -93,3 +96,120 @@ def process_document(doc, terms, pattern, total_label): commands['count_occurrences'] = OccurrenceCounter + + +class ShannonEntropy(): + + cli = quantgov.utils.CLISpec( + help='Shannon Entropy', + arguments=[ + quantgov.utils.CLIArg( + flags=('--word_pattern', '-wp'), + kwargs={ + 'help': 'regular expression defining a "word"', + 'type': re.compile, + 'default': re.compile(r'\b\w+\b') + } + ) + quantgov.utils.CLIArg( + flags=('--stopwords', '-sw'), + kwargs={ + 'help': 'stopwords to ignore', + 'default': set(stopwords.words('english')) + } + ) + ] + + ) + + @staticmethod + def get_columns(args): + return ('shannon_entropy',) + + @staticmethod + def process_document(doc, word_pattern, stopwords): + words = word_pattern.findall(doc.text) + lemmas = [ + lemma for lemma in ( + Word(word).lemmatize() for word in words + ) + if lemma not in stopwords + ] + counts = collections.Counter(lemmas) + return round(sum( + -(count / len(lemmas) * math.log(count / len(lemmas), 2)) + for count in counts.values() + ), 2) + + +commands['shannon_entropy'] = ShannonEntropy + + +class ConditionalCounter(): + + cli = quantgov.utils.CLISpec( + help='Conditional Counter', + arguments=[ + quantgov.utils.CLIArg( + flags=('--conditional_pattern', '-cp'), + kwargs={ + 'help': 'regular expression defining a "conditional"', + 'type': re.compile, + 'default': re.compile( + r'\b(if|but|except|provided|when|where|whenever|unless|notwithstanding' + r'|in\s+the\s+event|in\s+no\s+event)\b') + } + ) + ] + ) + + @staticmethod + def get_columns(args): + return ('conditional_count',) + + @staticmethod + def process_document(doc, conditional_pattern): + return len(pattern.findall(' '.join((doc.text).splitlines()))) + + +commands['conditional_count'] = ConditionalCounter + + +class SentenceLength(): + + cli = quantgov.utils.CLISpec( + help='Sentence Length', + arguments=[ + quantgov.utils.CLIArg( + flags=('--word_pattern', '-wp'), + kwargs={ + 'help': 'regular expression defining a "word"', + 'type': re.compile, + 'default': re.compile(r'\b\w+\b') + } + ) + quantgov.utils.CLIArg( + flags=('--sentence_pattern', '-sp'), + kwargs={ + 'help': 'regular expression defining a "sentence"', + 'type': re.compile, + 'default': re.compile(r'[A-Z][^\.!?]*[\.!?]') + } + ) + ] + ) + + @staticmethod + def get_columns(args): + return ('sentence_length',) + + @staticmethod + def process_document(doc, word_pattern, sentence_pattern): + sentences = sentence_pattern.findall(doc) + total_length = 0 + for sentence in sentences: + total_length += len(word_pattern.findall(sentence)) + return total_length / len(sentences) + + +commands['sentence_length'] = SentenceLength \ No newline at end of file diff --git a/quantgov/corpora/measure_complexity.py b/quantgov/corpora/measure_complexity.py deleted file mode 100644 index 0f20842..0000000 --- a/quantgov/corpora/measure_complexity.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -import argparse -import csv -import collections -import concurrent.futures -import io -import math -import logging -import sys -import re - -from nltk.corpus import stopwords - -from textblob import Word -# -from pathlib import Path - -# -ENCODE_IN = 'utf-8' -ENCODE_OUT = 'utf-8' - - -CYCLOMATICS = re.compile( - r'\b(if|but|except|provided|when|where|whenever|unless|notwithstanding' - r'|in\s+the\s+event|in\s+no\s+event)\b' -) - -WORDS = re.compile(r'\b\w+\b') - -LEMMAS = {} -STOPWORDS = set(stopwords.words('english')) - -log = logging.getLogger(Path(__file__).stem) - - -def lemmatize(word): - if word in LEMMAS: - lemma = LEMMAS[word] - else: - lemma = Word(word).lemmatize() - LEMMAS[word] = lemma - return lemma - - -def count_cyclomatics(text): - return len(CYCLOMATICS.findall(' '.join(text.splitlines()))) - - -def get_shannon_entropy(text, words): - lemmas = [ - lemma for lemma in ( - lemmatize(word) for word in words - ) - if lemma not in STOPWORDS - ] - counts = collections.Counter(lemmas) - return round(sum( - -(count / len(lemmas) * math.log(count / len(lemmas), 2)) - for count in counts.values() - ), 2) - - -def get_row_for_file(path): - text = path.read_text(encoding=ENCODE_IN).lower() - file = path.stem - words = WORDS.findall(text) - return ( - file, len(words), len(set(words)), count_cyclomatics(text), - get_shannon_entropy(text, words) - ) - - -def parse_args(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('indir', type=Path) - parser.add_argument('-o', '--outfile', - type=lambda x: open( - x, 'w', newline='', encoding=ENCODE_OUT), - default=io.TextIOWrapper( - sys.stdout.buffer, encoding=ENCODE_OUT) - ) - verbosity = parser.add_mutually_exclusive_group() - verbosity.add_argument('-v', '--verbose', action='store_const', - const=logging.DEBUG, default=logging.INFO) - verbosity.add_argument('-q', '--quiet', dest='verbose', - action='store_const', const=logging.WARNING) - return parser.parse_args() - - -def main(): - args = parse_args() - logging.basicConfig(level=args.verbose) - writer = csv.writer(args.outfile) - writer.writerow( - ('file', 'words', 'unique words', - 'cyclomatic_complexity', 'shannon_entropy') - ) - with concurrent.futures.ProcessPoolExecutor() as pool: - for file, words, uniques, cyclo, entropy in pool.map( - get_row_for_file, args.indir.iterdir() - ): - log.info(f'finished {file}') - writer.writerow((file, words, uniques, cyclo, entropy)) - - -if __name__ == "__main__": - main() diff --git a/quantgov/examples/baseball.txt b/quantgov/examples/baseball.txt deleted file mode 100644 index de713b7..0000000 --- a/quantgov/examples/baseball.txt +++ /dev/null @@ -1,799 +0,0 @@ -The Lahman Baseball Database - -2016 Version -Release Date: February 25, 2017 - - - -README CONTENTS -0.1 Copyright Notice -0.2 Contact Information - -1.0 Release Contents -1.1 Introduction -1.2 What's New -1.3 Acknowledgements -1.4 Using this Database -1.5 Revision History - -2.0 Data Tables - - - -0.1 Copyright Notice & Limited Use License - -This database is copyright 1996-2017 by Sean Lahman. - -This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. For details see: http://creativecommons.org/licenses/by-sa/3.0/ - - -For licensing information or further information, contact Sean Lahman -at: seanlahman@gmail.com - - - -0.2 Contact Information - -Web site: http://www.baseball1.com -E-Mail : seanlahman@gmail.com - -If you're interested in contributing to the maintenance of this -database or making suggestions for improvement, please consider -joining our mailinglist at: - - http://groups.yahoo.com/group/baseball-databank/ - -If you are interested in similar databases for other sports, please -vist the Open Source Sports website at http://OpenSourceSports.com - - -1.0 Release Contents - -This release of the database can be downloaded in several formats. The -contents of each version are listed below. - -MS Access Versions: - lahman2016.mdb - 2016readme.txt - -SQL version - lahman2016.sql - 2016readme.txt - -Comma Delimited Version: - 2016readme.txt - AllStarFull.csv - Appearances.csv - AwardsManagers.csv - AwardsPlayers.csv - AwardsShareManagers.csv - AwardsSharePlayers.csv - Batting.csv - BattingPost.csv - CollegePlaying.csv - Fielding.csv - FieldingOF.csv - FieldingPost.csv - FieldingOFsplit - HallOfFame.csv - HomeGames.csv - Managers.csv - ManagersHalf.csv - Master.csv - Parks.csv - Pitching.csv - PitchingPost.csv - Salaries.csv - Schools.csv - SeriesPost.csv - Teams.csv - TeamsFranchises.csv - TeamsHalf.csv - - -1.1 Introduction - -This database contains pitching, hitting, and fielding statistics for -Major League Baseball from 1871 through 2016. It includes data from -the two current leagues (American and National), the four other "major" -leagues (American Association, Union Association, Players League, and -Federal League), and the National Association of 1871-1875. - -This database was created by Sean Lahman, who pioneered the effort to -make baseball statistics freely available to the general public. What -started as a one man effort in 1994 has grown tremendously, and now a -team of researchers have collected their efforts to make this the -largest and most accurate source for baseball statistics available -anywhere. (See Acknowledgements below for a list of the key -contributors to this project.) - -None of what we have done would have been possible without the -pioneering work of Hy Turkin, S.C. Thompson, David Neft, and Pete -Palmer (among others). All baseball fans owe a debt of gratitude -to the people who have worked so hard to build the tremendous set -of data that we have today. Our thanks also to the many members of -the Society for American Baseball Research who have helped us over -the years. We strongly urge you to support and join their efforts. -Please vist their website (www.sabr.org). - -If you have any problems or find any errors, please let us know. Any -feedback is appreciated - - -1.2 What's New in 2016 - -Player stats have been updated through 2016 season. - -Three new tables have been introduced: FieldingOFsplit, Parks, and HomeGames - -Other notable changes include: -* Improvements to appearances table based on Retrosheet data -* Created a new table FieldingOFsplit, and migrated the LF/CF/RF entries from Fielding to it. -* Deleted all DH entries from Fielding. -* Re-built all regular season batting/pitching/fielding from 2000 forward. -* Filled in NULL values for 1973-1999 in batting for pitchers who did not have a PA due to the DH rule. -* Added Parks and HomeGames to better track major league ballparks, including neutral sites - - -1.3 Acknowledgements - -Much of the raw data contained in this database comes from the work of -Pete Palmer, the legendary statistician, who has had a hand in most -of the baseball encylopedias published since 1974. He is largely -responsible for bringing the batting, pitching, and fielding data out -of the dark ages and into the computer era. Without him, none of this -would be possible. For more on Pete's work, please read his own -account at: http://sabr.org/cmsfiles/PalmerDatabaseHistory.pdf - -Three people have been key contributors to the work that followed, first -by taking the raw data and creating a relational database, and later -by extending the database to make it more accesible to researchers. - -Sean Lahman launched the Baseball Archive's website back before -most people had heard of the world wide web. Frustrated by the -lack of sports data available, he led the effort to build a -baseball database that everyone could use. He created the first version -of the database and began to make it available for free download from -his website in 1995. - -The work of Sean Forman to create and maintain an online encyclopedia -at Baseball-Reference.com was a quantum leap for both fans and researchers. -The website launched in 2000, provding a user-friendly interface to the Lahman -Baseball Database. Forman and Lahman launched the Baseball Databank in 2001, -a group of researchers whose goal was to update and maintain the database -as an open source collection available to all. - -Ted Turocy has done the lion's share of the work to updating the main -data tables since 2012, automating the work of annual updates and linking -historical data to play-by-play accounts compiled by Retrosheet. - -A handful of researchers have made substantial contributions to -maintain this database over years. Listed alphabetically, they -are: Derek Adair, Mike Crain, Kevin Johnson, Rod Nelson, Tom Tango, -and Paul Wendt. These folks did much of the heavy lifting, and are -largely responsible for the improvements made since 2000. - -Others who made important contributions include: Dvd Avins, -Clifford Blau, Bill Burgess, Clem Comly, Jeff Burk, Randy Cox, -Mitch Dickerman, Paul DuBois, Mike Emeigh, F.X. Flinn, Bill Hickman, -Jerry Hoffman, Dan Holmes, Micke Hovmoller, Peter Kreutzer, -Danile Levine, Bruce Macleod, Ken Matinale, Michael Mavrogiannis, -Cliff Otto, Alberto Perdomo, Dave Quinn, John Rickert, Tom Ruane, -Theron Skyles, Hans Van Slooten, Michael Westbay, and Rob Wood. - -Many other people have made significant contributions to the database -over the years. The contribution of Tom Ruane's effort to the overall -quality of the underlying data has been tremendous. His work at -retrosheet.org integrates the yearly data with the day-by-day data, -creating a reference source of startling depth. - -Sean Holtz helped with a major overhaul and redesign before the -2000 season. Keith Woolner was instrumental in helping turn -a huge collection of stats into a relational database in the mid-1990s. -Clifford Otto & Ted Nye also helped provide guidance to the early -versions. Lee Sinnis, John Northey & Erik Greenwood helped supply key -pieces of data. Many others have written in with corrections and -suggestions that made each subsequent version even better than what -preceded it. - -The work of the SABR Baseball Records Committee, led by Lyle Spatz -has been invaluable. So has the work of Bill Carle and the SABR -Biographical Committee. David Vincent, keeper of the Home Run Log and -other bits of hard to find info, has always been helpful. The recent -addition of colleges to player bios is the result of much research by -members of SABR's Collegiate Baseball committee. - -Salary data was first supplied by Doug Pappas, who passed away during -the summer of 2004. He was the leading authority on many subjects, -most significantly the financial history of Major League Baseball. -We are grateful that he allowed us to include some of the data he -compiled. His work has been continued by the SABR Business of -Baseball committee. - -Thanks is also due to the staff at the National Baseball Library -in Cooperstown who have been so helpful over the years, including -Tim Wiles, Jim Gates, Bruce Markusen, and the rest of the staff. - -A special debt of gratitude is owed to Dave Smith and the folks at -Retrosheet. There is no other group working so hard to compile and -share baseball data. Their website (www.retrosheet.org) will give -you a taste of the wealth of information Dave and the gang have collected. - -Thanks to all contributors great and small. What you have created is -a wonderful thing. - - -1.4 Using this Database - -This version of the database is available in Microsoft Access -format, SQL files or in a generic, comma delimited format. Because this is a -relational database, you will not be able to use the data in a -flat-database application. - -Please note that this is not a stand alone application. It requires -a database application or some other application designed specifically -to interact with the database. - -If you are unable to import the data directly, you should download the -database in the delimted text format. Then use the documentation -in section 2.0 of this document to import the data into -your database application. - - -1.5 Revision History - - Version Date Comments - 1.0 December 1992 Database ported from dBase - 1.1 May 1993 Becomes fully relational - 1.2 July 1993 Corrections made to full database - 1.21 December 1993 1993 statistics added - 1.3 July 1994 Pre-1900 data added - 1.31 February 1995 1994 Statistics added - 1.32 August 1995 Statistics added for other leagues - 1.4 September 1995 Fielding Data added - 1.41 November 1995 1995 statistics added - 1.42 March 1996 HOF/All-Star tables added - 1.5-MS October 1996 1st public release - MS Access format - 1.5-GV October 1996 Released generic comma-delimted files - 1.6-MS December 1996 Updated with 1996 stats, some corrections - 1.61-MS December 1996 Corrected error in MASTER table - 1.62 February 1997 Corrected 1914-1915 batters data and updated - 2.0 February 1998 Major Revisions-added teams & managers - 2.1 October 1998 Interim release w/1998 stats - 2.2 January 1999 New release w/post-season stats & awards added - 3.0 November 1999 Major release - fixed errors and 1999 statistics added - 4.0 May 2001 Major release - proofed & redesigned tables - 4.5 March 2002 Updated with 2001 stats and added new biographical data - 5.0 December 2002 Major revision - new tables and data - 5.1 January 2004 Updated with 2003 data, and new pitching categories - 5.2 November 2004 Updated with 2004 season statistics. - 5.3 December 2005 Updated with 2005 season statistics. - 5.4 December 2006 Updated with 2006 season statistics. - 5.5 December 2007 Updated with 2007 season statistics. - 5.6 December 2008 Updated with 2008 season statistics. - 5.7 December 2009 Updated for 2009 and added several tables. - 5.8 December 2010 Updated with 2010 season statistics. - 5.9 December 2011 Updated for 2011 and removed obsolete tables. - 2012 December 2012 Updated with 2012 season statistics - 2013 December 2013 Updated with 2013 season statistics - 2014 December 2014 Updated with 2014 season statistics - 2015 December 2015 Updated with 2015 season statistics - 2016 February 2017 Updated for 201g and added several tables - - -2.0 Data Tables - -The design follows these general principles. Each player is assigned a -unique number (playerID). All of the information relating to that player -is tagged with his playerID. The playerIDs are linked to names and -birthdates in the MASTER table. - -The database is comprised of the following main tables: - - MASTER - Player names, DOB, and biographical info - Batting - batting statistics - Pitching - pitching statistics - Fielding - fielding statistics - -It is supplemented by these tables: - - AllStarFull - All-Star appearances - HallofFame - Hall of Fame voting data - Managers - managerial statistics - Teams - yearly stats and standings - BattingPost - post-season batting statistics - PitchingPost - post-season pitching statistics - TeamFranchises - franchise information - FieldingOF - outfield position data - FieldingPost- post-season fielding data - FieldingOFsplit - LF/CF/RF splits - ManagersHalf - split season data for managers - TeamsHalf - split season data for teams - Salaries - player salary data - SeriesPost - post-season series information - AwardsManagers - awards won by managers - AwardsPlayers - awards won by players - AwardsShareManagers - award voting for manager awards - AwardsSharePlayers - award voting for player awards - Appearances - details on the positions a player appeared at - Schools - list of colleges that players attended - CollegePlaying - list of players and the colleges they attended - Parks - list of major league ballparls - HomeGames - Number of homegames played by each team in each ballpark - - - - -2.1 MASTER table - - -playerID A unique code asssigned to each player. The playerID links - the data in this file with records in the other files. -birthYear Year player was born -birthMonth Month player was born -birthDay Day player was born -birthCountry Country where player was born -birthState State where player was born -birthCity City where player was born -deathYear Year player died -deathMonth Month player died -deathDay Day player died -deathCountry Country where player died -deathState State where player died -deathCity City where player died -nameFirst Player's first name -nameLast Player's last name -nameGiven Player's given name (typically first and middle) -weight Player's weight in pounds -height Player's height in inches -bats Player's batting hand (left, right, or both) -throws Player's throwing hand (left or right) -debut Date that player made first major league appearance -finalGame Date that player made first major league appearance (blank if still active) -retroID ID used by retrosheet -bbrefID ID used by Baseball Reference website - - - -2.2 Batting Table -playerID Player ID code -yearID Year -stint player's stint (order of appearances within a season) -teamID Team -lgID League -G Games -AB At Bats -R Runs -H Hits -2B Doubles -3B Triples -HR Homeruns -RBI Runs Batted In -SB Stolen Bases -CS Caught Stealing -BB Base on Balls -SO Strikeouts -IBB Intentional walks -HBP Hit by pitch -SH Sacrifice hits -SF Sacrifice flies -GIDP Grounded into double plays - - -2.3 Pitching table - -playerID Player ID code -yearID Year -stint player's stint (order of appearances within a season) -teamID Team -lgID League -W Wins -L Losses -G Games -GS Games Started -CG Complete Games -SHO Shutouts -SV Saves -IPOuts Outs Pitched (innings pitched x 3) -H Hits -ER Earned Runs -HR Homeruns -BB Walks -SO Strikeouts -BAOpp Opponent's Batting Average -ERA Earned Run Average -IBB Intentional Walks -WP Wild Pitches -HBP Batters Hit By Pitch -BK Balks -BFP Batters faced by Pitcher -GF Games Finished -R Runs Allowed -SH Sacrifices by opposing batters -SF Sacrifice flies by opposing batters -GIDP Grounded into double plays by opposing batter - -2.4 Fielding Table - -playerID Player ID code -yearID Year -stint player's stint (order of appearances within a season) -teamID Team -lgID League -Pos Position -G Games -GS Games Started -InnOuts Time played in the field expressed as outs -PO Putouts -A Assists -E Errors -DP Double Plays -PB Passed Balls (by catchers) -WP Wild Pitches (by catchers) -SB Opponent Stolen Bases (by catchers) -CS Opponents Caught Stealing (by catchers) -ZR Zone Rating - - -2.5 AllstarFull table - -playerID Player ID code -YearID Year -gameNum Game number (zero if only one All-Star game played that season) -gameID Retrosheet ID for the game idea -teamID Team -lgID League -GP 1 if Played in the game -startingPos If player was game starter, the position played - -2.6 HallOfFame table - -playerID Player ID code -yearID Year of ballot -votedBy Method by which player was voted upon -ballots Total ballots cast in that year -needed Number of votes needed for selection in that year -votes Total votes received -inducted Whether player was inducted by that vote or not (Y or N) -category Category in which candidate was honored -needed_note Explanation of qualifiers for special elections - -2.7 Managers table - -playerID Player ID Number -yearID Year -teamID Team -lgID League -inseason Managerial order. Zero if the individual managed the team - the entire year. Otherwise denotes where the manager appeared - in the managerial order (1 for first manager, 2 for second, etc.) -G Games managed -W Wins -L Losses -rank Team's final position in standings that year -plyrMgr Player Manager (denoted by 'Y') - - -2.8 Teams table - -yearID Year -lgID League -teamID Team -franchID Franchise (links to TeamsFranchise table) -divID Team's division -Rank Position in final standings -G Games played -GHome Games played at home -W Wins -L Losses -DivWin Division Winner (Y or N) -WCWin Wild Card Winner (Y or N) -LgWin League Champion(Y or N) -WSWin World Series Winner (Y or N) -R Runs scored -AB At bats -H Hits by batters -2B Doubles -3B Triples -HR Homeruns by batters -BB Walks by batters -SO Strikeouts by batters -SB Stolen bases -CS Caught stealing -HBP Batters hit by pitch -SF Sacrifice flies -RA Opponents runs scored -ER Earned runs allowed -ERA Earned run average -CG Complete games -SHO Shutouts -SV Saves -IPOuts Outs Pitched (innings pitched x 3) -HA Hits allowed -HRA Homeruns allowed -BBA Walks allowed -SOA Strikeouts by pitchers -E Errors -DP Double Plays -FP Fielding percentage -name Team's full name -park Name of team's home ballpark -attendance Home attendance total -BPF Three-year park factor for batters -PPF Three-year park factor for pitchers -teamIDBR Team ID used by Baseball Reference website -teamIDlahman45 Team ID used in Lahman database version 4.5 -teamIDretro Team ID used by Retrosheet - - -2.9 BattingPost table - -yearID Year -round Level of playoffs -playerID Player ID code -teamID Team -lgID League -G Games -AB At Bats -R Runs -H Hits -2B Doubles -3B Triples -HR Homeruns -RBI Runs Batted In -SB Stolen Bases -CS Caught stealing -BB Base on Balls -SO Strikeouts -IBB Intentional walks -HBP Hit by pitch -SH Sacrifices -SF Sacrifice flies -GIDP Grounded into double plays - - -2.10 PitchingPost table - -playerID Player ID code -yearID Year -round Level of playoffs -teamID Team -lgID League -W Wins -L Losses -G Games -GS Games Started -CG Complete Games -SHO Shutouts -SV Saves -IPOuts Outs Pitched (innings pitched x 3) -H Hits -ER Earned Runs -HR Homeruns -BB Walks -SO Strikeouts -BAOpp Opponents' batting average -ERA Earned Run Average -IBB Intentional Walks -WP Wild Pitches -HBP Batters Hit By Pitch -BK Balks -BFP Batters faced by Pitcher -GF Games Finished -R Runs Allowed -SH Sacrifice Hits allowed -SF Sacrifice Flies allowed -GIDP Grounded into Double Plays - - -2.11 TeamFranchises table - -franchID Franchise ID -franchName Franchise name -active Whetehr team is currently active (Y or N) -NAassoc ID of National Association team franchise played as - - -2.12 FieldingOF table - -playerID Player ID code -yearID Year -stint player's stint (order of appearances within a season) -Glf Games played in left field -Gcf Games played in center field -Grf Games played in right field - - -2.13 ManagersHalf table - -playerID Manager ID code -yearID Year -teamID Team -lgID League -inseason Managerial order. One if the individual managed the team - the entire year. Otherwise denotes where the manager appeared - in the managerial order (1 for first manager, 2 for second, etc.) -half First or second half of season -G Games managed -W Wins -L Losses -rank Team's position in standings for the half - - -2.14 TeamsHalf table - -yearID Year -lgID League -teamID Team -half First or second half of season -divID Division -DivWin Won Division (Y or N) -rank Team's position in standings for the half -G Games played -W Wins -L Losses - - -2.15 Salaries table - -yearID Year -teamID Team -lgID League -playerID Player ID code -salary Salary - - -2.16 SeriesPost table - -yearID Year -round Level of playoffs -teamIDwinner Team ID of the team that won the series -lgIDwinner League ID of the team that won the series -teamIDloser Team ID of the team that lost the series -lgIDloser League ID of the team that lost the series -wins Wins by team that won the series -losses Losses by team that won the series -ties Tie games - -2.17 AwardsManagers table - -playerID Manager ID code -awardID Name of award won -yearID Year -lgID League -tie Award was a tie (Y or N) -notes Notes about the award - - -2.18 AwardsPlayers table - -playerID Player ID code -awardID Name of award won -yearID Year -lgID League -tie Award was a tie (Y or N) -notes Notes about the award - - -2.19 AwardsShareManagers table - -awardID name of award votes were received for -yearID Year -lgID League -playerID Manager ID code -pointsWon Number of points received -pointsMax Maximum numner of points possible -votesFirst Number of first place votes - - -2.20 AwardsSharePlayers table - -awardID name of award votes were received for -yearID Year -lgID League -playerID Player ID code -pointsWon Number of points received -pointsMax Maximum numner of points possible -votesFirst Number of first place votes - - -2.21 FieldingPost table - -playerID Player ID code -yearID Year -teamID Team -lgID League -round Level of playoffs -Pos Position -G Games -GS Games Started -InnOuts Time played in the field expressed as outs -PO Putouts -A Assists -E Errors -DP Double Plays -TP Triple Plays -PB Passed Balls -SB Stolen Bases allowed (by catcher) -CS Caught Stealing (by catcher) - - -2.22 Appearances table - -yearID Year -teamID Team -lgID League -playerID Player ID code -G_all Total games played -GS Games started -G_batting Games in which player batted -G_defense Games in which player appeared on defense -G_p Games as pitcher -G_c Games as catcher -G_1b Games as firstbaseman -G_2b Games as secondbaseman -G_3b Games as thirdbaseman -G_ss Games as shortstop -G_lf Games as leftfielder -G_cf Games as centerfielder -G_rf Games as right fielder -G_of Games as outfielder -G_dh Games as designated hitter -G_ph Games as pinch hitter -G_pr Games as pinch runner - - - -2.23 Schools table -schoolID school ID code -schoolName school name -schoolCity city where school is located -schoolState state where school's city is located -schoolNick nickname for school's baseball team - - - -2.24 CollegePlaying table -playerid Player ID code -schoolID school ID code -year year - - - - -2.25 FieldingOFsplit table -playerID Player ID code -yearID Year -stint player's stint (order of appearances within a season) -teamID Team -lgID League -Pos Position -G Games -GS Games Started -InnOuts Time played in the field expressed as outs -PO Putouts -A Assists -E Errors -DP Double Plays - - - -2.26 Parks table -park.key ballpark ID code -park.name name of ballpark -park.alias alternate names of ballpark -city city -state state -country country - - -2.27 HomeGames table -year.key year -league.key league -team.key team ID -park.key ballpark ID -span.first date of first game played -span.last date of last game played -games total number of games -openings total number of dates played -attendance total attendaance \ No newline at end of file From 67313835c9d99ca9bbf11a3665b2ded388c4059d Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Tue, 7 Nov 2017 09:31:44 -0500 Subject: [PATCH 03/23] complexity builtins with tests --- quantgov/corpora/builtins.py | 55 +++++++++++++----------------------- setup.py | 4 +-- tests/test_corpora.py | 22 +++++++++++++++ 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index f642579..656e0da 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -3,11 +3,16 @@ """ import re import collections +import math import quantgov +from nltk.corpus import wordnet as wn from nltk.corpus import stopwords from textblob import Word +from textblob import TextBlob + +wn.ensure_loaded() commands = {} @@ -110,12 +115,12 @@ class ShannonEntropy(): 'type': re.compile, 'default': re.compile(r'\b\w+\b') } - ) + ), quantgov.utils.CLIArg( flags=('--stopwords', '-sw'), kwargs={ 'help': 'stopwords to ignore', - 'default': set(stopwords.words('english')) + 'default': stopwords.words('english') } ) ] @@ -136,10 +141,10 @@ def process_document(doc, word_pattern, stopwords): if lemma not in stopwords ] counts = collections.Counter(lemmas) - return round(sum( + return doc.index + (round(sum( -(count / len(lemmas) * math.log(count / len(lemmas), 2)) for count in counts.values() - ), 2) + ), 2),) commands['shannon_entropy'] = ShannonEntropy @@ -151,12 +156,13 @@ class ConditionalCounter(): help='Conditional Counter', arguments=[ quantgov.utils.CLIArg( - flags=('--conditional_pattern', '-cp'), + flags=('--pattern'), kwargs={ 'help': 'regular expression defining a "conditional"', 'type': re.compile, 'default': re.compile( - r'\b(if|but|except|provided|when|where|whenever|unless|notwithstanding' + r'\b(if|but|except|provided|when|where' + r'|whenever|unless|notwithstanding' r'|in\s+the\s+event|in\s+no\s+event)\b') } ) @@ -168,8 +174,9 @@ def get_columns(args): return ('conditional_count',) @staticmethod - def process_document(doc, conditional_pattern): - return len(pattern.findall(' '.join((doc.text).splitlines()))) + def process_document(doc, pattern): + return doc.index + (len(pattern.findall( + ' '.join((doc.text).splitlines()))),) commands['conditional_count'] = ConditionalCounter @@ -177,39 +184,17 @@ def process_document(doc, conditional_pattern): class SentenceLength(): - cli = quantgov.utils.CLISpec( - help='Sentence Length', - arguments=[ - quantgov.utils.CLIArg( - flags=('--word_pattern', '-wp'), - kwargs={ - 'help': 'regular expression defining a "word"', - 'type': re.compile, - 'default': re.compile(r'\b\w+\b') - } - ) - quantgov.utils.CLIArg( - flags=('--sentence_pattern', '-sp'), - kwargs={ - 'help': 'regular expression defining a "sentence"', - 'type': re.compile, - 'default': re.compile(r'[A-Z][^\.!?]*[\.!?]') - } - ) - ] - ) - @staticmethod def get_columns(args): return ('sentence_length',) @staticmethod - def process_document(doc, word_pattern, sentence_pattern): - sentences = sentence_pattern.findall(doc) + def process_document(doc): + sentences = TextBlob(doc.text).sentences total_length = 0 for sentence in sentences: - total_length += len(word_pattern.findall(sentence)) - return total_length / len(sentences) + total_length += len(sentence.words) + return doc.index + (round(total_length / len(sentences), 2),) -commands['sentence_length'] = SentenceLength \ No newline at end of file +commands['sentence_length'] = SentenceLength diff --git a/setup.py b/setup.py index 345a464..ac4a84f 100644 --- a/setup.py +++ b/setup.py @@ -60,9 +60,9 @@ def find_version(*file_paths): 'snakemake', ], extras_require={ - 'testing': ['pytest-flake8'] + 'testing': ['pytest-flake8'], 'complexity': [ - 'textblob,' + 'textblob', 'nltk' ] }, diff --git a/tests/test_corpora.py b/tests/test_corpora.py index 5e89460..4362f9f 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -126,3 +126,25 @@ def test_termcount_multiple_with_label(): 'lorem', 'dolor sit', '--total_label', 'bothofem'], ) assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n' + + +def test_shannon_entropy(): + output = check_output( + ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)], + ) + assert output == 'file,shannon_entropy\n1,7.14\n2,8.13\n' + + +def test_conditionalcount(): + output = check_output( + ['quantgov', 'corpus', 'conditional_count', str(PSEUDO_CORPUS_PATH)], + ) + assert output == 'file,conditional_count\n1,0\n2,0\n' + + +def test_sentencelength(): + output = check_output( + ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)], + ) + assert output == ('file,sentence_length\n' + '1,9.54\n2,8.16\n') From 1aabb9a4e68acf745deb564a15a896525130ac25 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Wed, 8 Nov 2017 09:04:40 -0500 Subject: [PATCH 04/23] code review updates --- quantgov/corpora/builtins.py | 87 ++++++++++++++++++++++-------------- setup.py | 5 +-- tests/test_corpora.py | 3 +- 3 files changed, 55 insertions(+), 40 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 656e0da..613347d 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -7,12 +7,10 @@ import quantgov -from nltk.corpus import wordnet as wn -from nltk.corpus import stopwords -from textblob import Word -from textblob import TextBlob +import nltk.corpus +import textblob -wn.ensure_loaded() +nltk.corpus.wordnet.ensure_loaded() commands = {} @@ -104,7 +102,7 @@ def process_document(doc, terms, pattern, total_label): class ShannonEntropy(): - + LEMMAS = {} cli = quantgov.utils.CLISpec( help='Shannon Entropy', arguments=[ @@ -120,11 +118,17 @@ class ShannonEntropy(): flags=('--stopwords', '-sw'), kwargs={ 'help': 'stopwords to ignore', - 'default': stopwords.words('english') + 'default': nltk.corpus.stopwords.words('english') + } + ), + quantgov.utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 } ) ] - ) @staticmethod @@ -132,11 +136,11 @@ def get_columns(args): return ('shannon_entropy',) @staticmethod - def process_document(doc, word_pattern, stopwords): + def process_document(doc, word_pattern, stopwords, precision): words = word_pattern.findall(doc.text) lemmas = [ lemma for lemma in ( - Word(word).lemmatize() for word in words + ShannonEntropy.lemmatize(word) for word in words ) if lemma not in stopwords ] @@ -144,29 +148,32 @@ def process_document(doc, word_pattern, stopwords): return doc.index + (round(sum( -(count / len(lemmas) * math.log(count / len(lemmas), 2)) for count in counts.values() - ), 2),) + ), precision),) + + def lemmatize(word): + if word in ShannonEntropy.LEMMAS: + lemma = ShannonEntropy.LEMMAS[word] + else: + lemma = textblob.Word(word).lemmatize() + ShannonEntropy.LEMMAS[word] = lemma + return lemma commands['shannon_entropy'] = ShannonEntropy class ConditionalCounter(): - cli = quantgov.utils.CLISpec( - help='Conditional Counter', - arguments=[ - quantgov.utils.CLIArg( - flags=('--pattern'), - kwargs={ - 'help': 'regular expression defining a "conditional"', - 'type': re.compile, - 'default': re.compile( - r'\b(if|but|except|provided|when|where' - r'|whenever|unless|notwithstanding' - r'|in\s+the\s+event|in\s+no\s+event)\b') - } - ) - ] + help=('Count conditional words and phrases. Included terms are: ' + ' "if", "but", "except", "provided", "when", "where", ' + '"whenever", "unless", "notwithstanding", "in the event", ' + 'and "in no event"'), + arguments=[] + ) + pattern = re.compile( + r'\b(if|but|except|provided|when|where' + r'|whenever|unless|notwithstanding' + r'|in\s+the\s+event|in\s+no\s+event)\b' ) @staticmethod @@ -174,8 +181,8 @@ def get_columns(args): return ('conditional_count',) @staticmethod - def process_document(doc, pattern): - return doc.index + (len(pattern.findall( + def process_document(doc): + return doc.index + (len(ConditionalCounter.pattern.findall( ' '.join((doc.text).splitlines()))),) @@ -184,17 +191,29 @@ def process_document(doc, pattern): class SentenceLength(): + cli = quantgov.utils.CLISpec( + help='Shannon Entropy', + arguments=[ + quantgov.utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + @staticmethod def get_columns(args): return ('sentence_length',) @staticmethod - def process_document(doc): - sentences = TextBlob(doc.text).sentences - total_length = 0 - for sentence in sentences: - total_length += len(sentence.words) - return doc.index + (round(total_length / len(sentences), 2),) + def process_document(doc, precision): + sentences = textblob.TextBlob(doc.text).sentences + return doc.index + (round(sum(len( + sentence.words) for sentence in sentences) / + len(sentences), precision),) commands['sentence_length'] = SentenceLength diff --git a/setup.py b/setup.py index ac4a84f..979f092 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,7 @@ def find_version(*file_paths): ], extras_require={ 'testing': ['pytest-flake8'], - 'complexity': [ - 'textblob', - 'nltk' - ] + 'complexity': ['textblob'] }, entry_points={ 'console_scripts': [ diff --git a/tests/test_corpora.py b/tests/test_corpora.py index 4362f9f..46e48e7 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -146,5 +146,4 @@ def test_sentencelength(): output = check_output( ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)], ) - assert output == ('file,sentence_length\n' - '1,9.54\n2,8.16\n') + assert output == 'file,sentence_length\n1,9.54\n2,8.16\n' From 43f4d3761b3d4bd00504e82cafaf4b7dca02ab46 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Wed, 8 Nov 2017 09:35:15 -0500 Subject: [PATCH 05/23] option tests --- quantgov/corpora/builtins.py | 10 +++++----- tests/test_corpora.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 613347d..9a1b662 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -148,7 +148,7 @@ def process_document(doc, word_pattern, stopwords, precision): return doc.index + (round(sum( -(count / len(lemmas) * math.log(count / len(lemmas), 2)) for count in counts.values() - ), precision),) + ), int(precision)),) def lemmatize(word): if word in ShannonEntropy.LEMMAS: @@ -178,7 +178,7 @@ class ConditionalCounter(): @staticmethod def get_columns(args): - return ('conditional_count',) + return ('conditionals',) @staticmethod def process_document(doc): @@ -186,13 +186,13 @@ def process_document(doc): ' '.join((doc.text).splitlines()))),) -commands['conditional_count'] = ConditionalCounter +commands['count_conditionals'] = ConditionalCounter class SentenceLength(): cli = quantgov.utils.CLISpec( - help='Shannon Entropy', + help='Sentence Length', arguments=[ quantgov.utils.CLIArg( flags=('--precision'), @@ -213,7 +213,7 @@ def process_document(doc, precision): sentences = textblob.TextBlob(doc.text).sentences return doc.index + (round(sum(len( sentence.words) for sentence in sentences) / - len(sentences), precision),) + len(sentences), int(precision)),) commands['sentence_length'] = SentenceLength diff --git a/tests/test_corpora.py b/tests/test_corpora.py index 46e48e7..bfd1646 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -135,11 +135,27 @@ def test_shannon_entropy(): assert output == 'file,shannon_entropy\n1,7.14\n2,8.13\n' +def test_shannon_entropy_no_stopwords(): + output = check_output( + ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), + '--stopwords', 'None'], + ) + assert output == 'file,shannon_entropy\n1,7.18\n2,8.09\n' + + +def test_shannon_entropy_4decimals(): + output = check_output( + ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH), + '--precision', '4'], + ) + assert output == 'file,shannon_entropy\n1,7.1413\n2,8.1252\n' + + def test_conditionalcount(): output = check_output( - ['quantgov', 'corpus', 'conditional_count', str(PSEUDO_CORPUS_PATH)], + ['quantgov', 'corpus', 'count_conditionals', str(PSEUDO_CORPUS_PATH)], ) - assert output == 'file,conditional_count\n1,0\n2,0\n' + assert output == 'file,conditionals\n1,0\n2,0\n' def test_sentencelength(): @@ -147,3 +163,11 @@ def test_sentencelength(): ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)], ) assert output == 'file,sentence_length\n1,9.54\n2,8.16\n' + + +def test_sentencelength_4decimals(): + output = check_output( + ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH), + '--precision', '4'], + ) + assert output == 'file,sentence_length\n1,9.5385\n2,8.1633\n' From d1213981567d550256f40726ea781fb57ea633ce Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Wed, 8 Nov 2017 14:27:29 -0500 Subject: [PATCH 06/23] added nltk requirement in setup.py --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 979f092..a32396a 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,10 @@ def find_version(*file_paths): ], extras_require={ 'testing': ['pytest-flake8'], - 'complexity': ['textblob'] + 'builtins': [ + 'textblob', + 'nltk' + ] }, entry_points={ 'console_scripts': [ From a5c15c9e80965a35b1fe5d6aefb498e34944f967 Mon Sep 17 00:00:00 2001 From: jnelson16 Date: Thu, 9 Nov 2017 09:55:16 -0500 Subject: [PATCH 07/23] add pip install to .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 0173ffb..6ccd1e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ python: - '3.6' install: - pip install ".[testing]" +- pip install ".[builtins]" script: pytest deploy: provider: pypi From c1ebb763a112c7b5ace14c088400893d105c9199 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 11:24:11 -0500 Subject: [PATCH 08/23] nltk fixes --- quantgov/corpora/builtins.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 9a1b662..a3350cc 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -7,10 +7,15 @@ import quantgov +import nltk import nltk.corpus import textblob -nltk.corpus.wordnet.ensure_loaded() +try: + nltk.corpus.wordnet.ensure_loaded() +except LookupError: + nltk.download('wordnet') + nltk.corpus.wordnet.ensure_loaded() commands = {} From d731de1deb770817676737df7ad9eff428ea0ed8 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 11:32:48 -0500 Subject: [PATCH 09/23] another nltk fix --- quantgov/corpora/builtins.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index a3350cc..b25846c 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -108,6 +108,11 @@ def process_document(doc, terms, pattern, total_label): class ShannonEntropy(): LEMMAS = {} + try: + nltk.corpus.stopwords.ensure_loaded() + except LookupError: + nltk.download('stopwords') + nltk.corpus.stopwords.ensure_loaded() cli = quantgov.utils.CLISpec( help='Shannon Entropy', arguments=[ From 1b0e35af857af9388a98b76ac617e764886b0c07 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 11:38:58 -0500 Subject: [PATCH 10/23] last nltk fix? --- quantgov/corpora/builtins.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index b25846c..4020fc9 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -17,6 +17,18 @@ nltk.download('wordnet') nltk.corpus.wordnet.ensure_loaded() +try: + nltk.corpus.stopwords.ensure_loaded() +except LookupError: + nltk.download('stopwords') + nltk.corpus.stopwords.ensure_loaded() + +try: + nltk.corpus.punkt.ensure_loaded() +except: + nltk.download('punkt') + nltk.corpus.punkt.ensure_loaded() + commands = {} @@ -108,11 +120,6 @@ def process_document(doc, terms, pattern, total_label): class ShannonEntropy(): LEMMAS = {} - try: - nltk.corpus.stopwords.ensure_loaded() - except LookupError: - nltk.download('stopwords') - nltk.corpus.stopwords.ensure_loaded() cli = quantgov.utils.CLISpec( help='Shannon Entropy', arguments=[ From b1e142ed3923ee7193d9581bb1cd8aad560da2a1 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 11:44:16 -0500 Subject: [PATCH 11/23] you know the drill --- quantgov/corpora/builtins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 4020fc9..dfc35b3 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -24,10 +24,10 @@ nltk.corpus.stopwords.ensure_loaded() try: - nltk.corpus.punkt.ensure_loaded() + nltk.tokenize.punkt.ensure_loaded() except: nltk.download('punkt') - nltk.corpus.punkt.ensure_loaded() + nltk.tokenize.punkt.ensure_loaded() commands = {} From 41b17f886320e70f1049edeb11ced1f124d5a86a Mon Sep 17 00:00:00 2001 From: jnelson16 Date: Fri, 10 Nov 2017 14:47:56 -0500 Subject: [PATCH 12/23] Update .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6ccd1e7..99eb3ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: install: - pip install ".[testing]" - pip install ".[builtins]" +- python -m nltk.downloader punkt stopwords wordnet script: pytest deploy: provider: pypi From 842204b61d67ab69d6a928498b7c1ffa0029ea3d Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 14:50:15 -0500 Subject: [PATCH 13/23] nltk troubles --- quantgov/corpora/builtins.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index dfc35b3..463cdc9 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -23,12 +23,6 @@ nltk.download('stopwords') nltk.corpus.stopwords.ensure_loaded() -try: - nltk.tokenize.punkt.ensure_loaded() -except: - nltk.download('punkt') - nltk.tokenize.punkt.ensure_loaded() - commands = {} From 3f95a988eb6fadf80c14e78de3179f537bd6f438 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 16:30:16 -0500 Subject: [PATCH 14/23] some final cleanup --- quantgov/corpora/builtins.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 463cdc9..f6e73bc 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -7,22 +7,9 @@ import quantgov -import nltk import nltk.corpus import textblob -try: - nltk.corpus.wordnet.ensure_loaded() -except LookupError: - nltk.download('wordnet') - nltk.corpus.wordnet.ensure_loaded() - -try: - nltk.corpus.stopwords.ensure_loaded() -except LookupError: - nltk.download('stopwords') - nltk.corpus.stopwords.ensure_loaded() - commands = {} @@ -113,7 +100,7 @@ def process_document(doc, terms, pattern, total_label): class ShannonEntropy(): - LEMMAS = {} + lemmas = {} cli = quantgov.utils.CLISpec( help='Shannon Entropy', arguments=[ @@ -162,11 +149,11 @@ def process_document(doc, word_pattern, stopwords, precision): ), int(precision)),) def lemmatize(word): - if word in ShannonEntropy.LEMMAS: - lemma = ShannonEntropy.LEMMAS[word] + if word in ShannonEntropy.lemmas: + lemma = ShannonEntropy.lemmas[word] else: lemma = textblob.Word(word).lemmatize() - ShannonEntropy.LEMMAS[word] = lemma + ShannonEntropy.lemmas[word] = lemma return lemma From f9bd2204599b59c5101f3768626b66dbf90d18ab Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 10 Nov 2017 16:35:14 -0500 Subject: [PATCH 15/23] if it aint broke... --- quantgov/corpora/builtins.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index f6e73bc..a94e594 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -10,6 +10,12 @@ import nltk.corpus import textblob +try: + nltk.corpus.wordnet.ensure_loaded() +except LookupError: + nltk.download('wordnet') + nltk.corpus.wordnet.ensure_loaded() + commands = {} From 61312c30e7add0a9fa144ee9ada7660cf4917aa5 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Mon, 13 Nov 2017 09:37:55 -0500 Subject: [PATCH 16/23] textblob sentiment --- quantgov/corpora/builtins.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index a94e594..3efd4fc 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -221,3 +221,43 @@ def process_document(doc, precision): commands['sentence_length'] = SentenceLength + + +class SentimentAnalysis(): + + cli = quantgov.utils.CLISpec( + help='Performs sentiment analysis on the text', + arguments=[ + quantgov.utils.CLIArg( + flags=('--analyzer'), + kwargs={ + 'help': 'which analyzer to use', + 'default': 'textblob' + } + ), + quantgov.utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + if args['analyzer'] == 'textblob': + return ('sentiment_polarity', 'sentiment_subjectivity',) + else: + raise NotImplementedError + + @staticmethod + def process_document(doc, analyzer, precision): + if analyzer == 'textblob': + sentiment = textblob.TextBlob(doc.text) + return doc.index + (round(sentiment.polarity, int(precision)), + round(sentiment.subjectivity, int(precision)),) + + +commands['sentiment_analysis'] = SentimentAnalysis From 0bdff3df5b942897121465b8e2d0bb5246d911b3 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Mon, 13 Nov 2017 16:51:25 -0500 Subject: [PATCH 17/23] tests and error raising --- quantgov/corpora/builtins.py | 39 ++++++++++++++++++++++++++---------- quantgov/corpora/utils.py | 15 ++++++++++++++ tests/test_corpora.py | 17 ++++++++++++++++ 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 3efd4fc..62aa0e3 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -7,14 +7,23 @@ import quantgov -import nltk.corpus -import textblob +try: + import nltk.corpus + NLTK = True +except ModuleNotFoundError: + NLTK = None try: - nltk.corpus.wordnet.ensure_loaded() -except LookupError: - nltk.download('wordnet') - nltk.corpus.wordnet.ensure_loaded() + import textblob +except ModuleNotFoundError: + textblob = None + +if NLTK: + try: + nltk.corpus.wordnet.ensure_loaded() + except LookupError: + nltk.download('wordnet') + nltk.corpus.wordnet.ensure_loaded() commands = {} @@ -122,7 +131,7 @@ class ShannonEntropy(): flags=('--stopwords', '-sw'), kwargs={ 'help': 'stopwords to ignore', - 'default': nltk.corpus.stopwords.words('english') + 'default': nltk.corpus.stopwords.words('english') if NLTK else None } ), quantgov.utils.CLIArg( @@ -140,7 +149,10 @@ def get_columns(args): return ('shannon_entropy',) @staticmethod - def process_document(doc, word_pattern, stopwords, precision): + @quantgov.corpora.utils.check_nltk + @quantgov.corpora.utils.check_textblob + def process_document(doc, word_pattern, precision, stopwords, + textblob=textblob, nltk=NLTK): words = word_pattern.findall(doc.text) lemmas = [ lemma for lemma in ( @@ -213,7 +225,9 @@ def get_columns(args): return ('sentence_length',) @staticmethod - def process_document(doc, precision): + @quantgov.corpora.utils.check_nltk + @quantgov.corpora.utils.check_textblob + def process_document(doc, precision, textblob=textblob, nltk=NLTK): sentences = textblob.TextBlob(doc.text).sentences return doc.index + (round(sum(len( sentence.words) for sentence in sentences) / @@ -253,11 +267,14 @@ def get_columns(args): raise NotImplementedError @staticmethod - def process_document(doc, analyzer, precision): + @quantgov.corpora.utils.check_nltk + @quantgov.corpora.utils.check_textblob + def process_document(doc, analyzer, precision, + textblob=textblob, nltk=NLTK): if analyzer == 'textblob': sentiment = textblob.TextBlob(doc.text) return doc.index + (round(sentiment.polarity, int(precision)), - round(sentiment.subjectivity, int(precision)),) + round(sentiment.subjectivity, int(precision)),) commands['sentiment_analysis'] = SentimentAnalysis diff --git a/quantgov/corpora/utils.py b/quantgov/corpora/utils.py index 136929f..1aa10f9 100644 --- a/quantgov/corpora/utils.py +++ b/quantgov/corpora/utils.py @@ -3,6 +3,7 @@ """ import sys +from decorator import decorator from pathlib import Path @@ -14,3 +15,17 @@ def load_driver(corpus): from driver import driver sys.path.pop(0) return driver + + +@decorator +def check_nltk(func, *args, **kwargs): + if args[-1] is None: + raise RuntimeError('Must install NLTK to use {}'.format(func)) + return func(*args, **kwargs) + + +@decorator +def check_textblob(func, *args, **kwargs): + if args[-2] is None: + raise RuntimeError('Must install textblob to use {}'.format(func)) + return func(*args, **kwargs) diff --git a/tests/test_corpora.py b/tests/test_corpora.py index bfd1646..6863e36 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -171,3 +171,20 @@ def test_sentencelength_4decimals(): '--precision', '4'], ) assert output == 'file,sentence_length\n1,9.5385\n2,8.1633\n' + + +def test_sentiment_analysis(): + output = check_output( + ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH)], + ) + assert output == ('file,sentiment_polarity,sentiment_subjectivity' + '\n1,0.0,0.0\n2,0.0,0.0\n') + + +def test_sentiment_analysis_4decimals(): + output = check_output( + ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH), + '--precision', '4'], + ) + assert output == ('file,sentiment_polarity,sentiment_subjectivity' + '\n1,0.0,0.0\n2,0.0,0.0\n') From 7c7d1d2fdde0920b64456366eb06bb0f1f1206a6 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 17 Nov 2017 11:33:21 -0500 Subject: [PATCH 18/23] fixed install req --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a32396a..55d3180 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,8 @@ def find_version(*file_paths): 'testing': ['pytest-flake8'], 'builtins': [ 'textblob', - 'nltk' + 'nltk', + 'decorator' ] }, entry_points={ From bdac27495aca0dcb5454a8d0d996a002b75f37c3 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Fri, 17 Nov 2017 11:51:40 -0500 Subject: [PATCH 19/23] pep8 fixes --- quantgov/corpora/builtins.py | 5 +++-- setup.cfg | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 62aa0e3..2e27dd8 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -131,7 +131,8 @@ class ShannonEntropy(): flags=('--stopwords', '-sw'), kwargs={ 'help': 'stopwords to ignore', - 'default': nltk.corpus.stopwords.words('english') if NLTK else None + 'default': (nltk.corpus.stopwords.words('english') + if NLTK else None) } ), quantgov.utils.CLIArg( @@ -152,7 +153,7 @@ def get_columns(args): @quantgov.corpora.utils.check_nltk @quantgov.corpora.utils.check_textblob def process_document(doc, word_pattern, precision, stopwords, - textblob=textblob, nltk=NLTK): + textblob=textblob, nltk=NLTK): words = word_pattern.findall(doc.text) lemmas = [ lemma for lemma in ( diff --git a/setup.cfg b/setup.cfg index aab50ea..aa335ad 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ universal = 1 [tool:pytest] addopts = --flake8 flake8-ignore = - *.py W391 W503 + *.py W391 W503 F821 */__init__.py F401 tests/* F401 E402 From a20bd1235b7f959073d37c2ad6413cf9af946968 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Tue, 28 Nov 2017 13:30:27 -0500 Subject: [PATCH 20/23] code review updates --- quantgov/corpora/builtins.py | 48 +++++++++++++++++++++++++++--------- quantgov/corpora/utils.py | 15 ----------- setup.cfg | 2 +- setup.py | 4 +-- 4 files changed, 40 insertions(+), 29 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 2e27dd8..d4da9b5 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -5,6 +5,7 @@ import collections import math +from decorator import decorator import quantgov try: @@ -28,6 +29,20 @@ commands = {} +@decorator +def check_nltk(func, *args, **kwargs): + if NLTK is None: + raise RuntimeError('Must install NLTK to use {}'.format(func)) + return func(*args, **kwargs) + + +@decorator +def check_textblob(func, *args, **kwargs): + if textblob is None: + raise RuntimeError('Must install textblob to use {}'.format(func)) + return func(*args, **kwargs) + + class WordCounter(): cli = quantgov.utils.CLISpec( @@ -228,11 +243,17 @@ def get_columns(args): @staticmethod @quantgov.corpora.utils.check_nltk @quantgov.corpora.utils.check_textblob - def process_document(doc, precision, textblob=textblob, nltk=NLTK): + def process_document(doc, precision): sentences = textblob.TextBlob(doc.text).sentences - return doc.index + (round(sum(len( - sentence.words) for sentence in sentences) / - len(sentences), int(precision)),) + # Allows for rounding to a specified number of decimals + if precision: + return doc.index + (round(sum(len( + sentence.words) for sentence in sentences) / + len(sentences), int(precision)),) + else: + return doc.index + (sum(len( + sentence.words) for sentence in sentences) / + len(sentences),) commands['sentence_length'] = SentenceLength @@ -244,9 +265,9 @@ class SentimentAnalysis(): help='Performs sentiment analysis on the text', arguments=[ quantgov.utils.CLIArg( - flags=('--analyzer'), + flags=('--backend'), kwargs={ - 'help': 'which analyzer to use', + 'help': 'which program to use for the analysis', 'default': 'textblob' } ), @@ -270,12 +291,17 @@ def get_columns(args): @staticmethod @quantgov.corpora.utils.check_nltk @quantgov.corpora.utils.check_textblob - def process_document(doc, analyzer, precision, - textblob=textblob, nltk=NLTK): - if analyzer == 'textblob': + def process_document(doc, backend, precision): + if backend == 'textblob': sentiment = textblob.TextBlob(doc.text) - return doc.index + (round(sentiment.polarity, int(precision)), - round(sentiment.subjectivity, int(precision)),) + # Allows for rounding to a specified number of decimals + if precision: + return (doc.index + + (round(sentiment.polarity, int(precision)), + round(sentiment.subjectivity, int(precision)),)) + else: + return (doc.index + + (sentiment.polarity, sentiment.subjectivity,)) commands['sentiment_analysis'] = SentimentAnalysis diff --git a/quantgov/corpora/utils.py b/quantgov/corpora/utils.py index 1aa10f9..136929f 100644 --- a/quantgov/corpora/utils.py +++ b/quantgov/corpora/utils.py @@ -3,7 +3,6 @@ """ import sys -from decorator import decorator from pathlib import Path @@ -15,17 +14,3 @@ def load_driver(corpus): from driver import driver sys.path.pop(0) return driver - - -@decorator -def check_nltk(func, *args, **kwargs): - if args[-1] is None: - raise RuntimeError('Must install NLTK to use {}'.format(func)) - return func(*args, **kwargs) - - -@decorator -def check_textblob(func, *args, **kwargs): - if args[-2] is None: - raise RuntimeError('Must install textblob to use {}'.format(func)) - return func(*args, **kwargs) diff --git a/setup.cfg b/setup.cfg index aa335ad..aab50ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ universal = 1 [tool:pytest] addopts = --flake8 flake8-ignore = - *.py W391 W503 F821 + *.py W391 W503 */__init__.py F401 tests/* F401 E402 diff --git a/setup.py b/setup.py index 55d3180..3d424b1 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ def find_version(*file_paths): packages=find_packages( exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), install_requires=[ + 'decorator', 'joblib', 'pandas', 'requests', @@ -61,10 +62,9 @@ def find_version(*file_paths): ], extras_require={ 'testing': ['pytest-flake8'], - 'builtins': [ + 'nlp': [ 'textblob', 'nltk', - 'decorator' ] }, entry_points={ From 187640724c0e3c35550da51172748dd815fff505 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Tue, 28 Nov 2017 13:35:08 -0500 Subject: [PATCH 21/23] fix travis file --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 99eb3ac..75cc0ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ python: - '3.6' install: - pip install ".[testing]" -- pip install ".[builtins]" +- pip install ".[nlp]" - python -m nltk.downloader punkt stopwords wordnet script: pytest deploy: From 981e075ee42f23f9df9042b05aad6ba95b331680 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Tue, 28 Nov 2017 13:48:39 -0500 Subject: [PATCH 22/23] import fixes --- quantgov/corpora/builtins.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index d4da9b5..54cd2f7 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -11,12 +11,12 @@ try: import nltk.corpus NLTK = True -except ModuleNotFoundError: +except ImportError: NLTK = None try: import textblob -except ModuleNotFoundError: +except ImportError: textblob = None if NLTK: @@ -165,8 +165,8 @@ def get_columns(args): return ('shannon_entropy',) @staticmethod - @quantgov.corpora.utils.check_nltk - @quantgov.corpora.utils.check_textblob + @check_nltk + @check_textblob def process_document(doc, word_pattern, precision, stopwords, textblob=textblob, nltk=NLTK): words = word_pattern.findall(doc.text) @@ -241,8 +241,8 @@ def get_columns(args): return ('sentence_length',) @staticmethod - @quantgov.corpora.utils.check_nltk - @quantgov.corpora.utils.check_textblob + @check_nltk + @check_textblob def process_document(doc, precision): sentences = textblob.TextBlob(doc.text).sentences # Allows for rounding to a specified number of decimals @@ -289,8 +289,8 @@ def get_columns(args): raise NotImplementedError @staticmethod - @quantgov.corpora.utils.check_nltk - @quantgov.corpora.utils.check_textblob + @check_nltk + @check_textblob def process_document(doc, backend, precision): if backend == 'textblob': sentiment = textblob.TextBlob(doc.text) From f0f43f3da475e7eddc801465bf94f935c6bf2f31 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Tue, 28 Nov 2017 13:55:56 -0500 Subject: [PATCH 23/23] small fix --- quantgov/corpora/builtins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py index 54cd2f7..bb9b9b6 100644 --- a/quantgov/corpora/builtins.py +++ b/quantgov/corpora/builtins.py @@ -283,7 +283,7 @@ class SentimentAnalysis(): @staticmethod def get_columns(args): - if args['analyzer'] == 'textblob': + if args['backend'] == 'textblob': return ('sentiment_polarity', 'sentiment_subjectivity',) else: raise NotImplementedError