diff --git a/cross_section/README.md b/cross_section/README.md new file mode 100644 index 0000000..12159e2 --- /dev/null +++ b/cross_section/README.md @@ -0,0 +1,33 @@ +### Datasets information + +Here is an information table for the datasets: + +| name | description | size | source | Features | +| ------------------------ | ----------------------------------------------------------- | ----- | --------------------------------------------------------- | ------------------------------------------------------------ | +| cities_brazil.csv | Population of cities in Brazil in 2023 | 18kb | https://worldpopulationreview.com/countries/cities/brazil | | +| cities_us.csv | Population of cities in the United States | 48kb | https://worldpopulationreview.com/us-cities | Including population data in 2023, 2022, 2020 census, 2010 census | +| forbes-billionaires.csv* | The world's richest 2000 people in 2020 according to Forbes | 790kb | https://www.forbes.com/billionaires/ | | +| forbes-global2000.csv* | The world's largest 2000 firms in 2020 according to Forbes | 118kb | https://www.forbes.com/lists/global2000/ | Variables measuring the firm size: sales, profits, assets, market value | +| | | | | | + +*These two datasets are generated using the file ``webscrape_forbes.ipynb`` with the Forbes API. + +### Instructions on fetching the data for use in a lecture + +There are several ways of fetching and using the data. 
+ +In a ``Python`` environment you can follow these steps: + +Step 1: find the file's URL by clicking the file -->> view raw -->> copy the URL + +Step 2: paste the URL into the following code + +``` +import pandas as pd + +url = "" +pd.read_csv(url) # for csv files +``` + + + diff --git a/cross_section/cities_brazil.csv b/cross_section/cities_brazil.csv new file mode 100644 index 0000000..d6fd68d --- /dev/null +++ b/cross_section/cities_brazil.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:127c77ad01d5c6fc6225222698bee6a0f2fbf9e6dbef9ccff96e6cc92d722e39 +size 17888 diff --git a/cross_section/cities_us.csv b/cross_section/cities_us.csv new file mode 100644 index 0000000..6d0b47c --- /dev/null +++ b/cross_section/cities_us.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa831de3a9e318712fc688e62908fb353eb18497b729fc8b6fda17b2653ceebb +size 48126 diff --git a/cross_section/forbes-billionaires.csv b/cross_section/forbes-billionaires.csv new file mode 100644 index 0000000..23298b8 --- /dev/null +++ b/cross_section/forbes-billionaires.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45618eb7ccab9a132c87005d74bfbd0d18ffb7ff599f619f85083edad0149d63 +size 794366 diff --git a/cross_section/forbes-global2000.csv b/cross_section/forbes-global2000.csv new file mode 100644 index 0000000..1ec928f --- /dev/null +++ b/cross_section/forbes-global2000.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c7af7c1ee77064094b17327150e27dd3c56147f8e343526681e35bcee3e4c6 +size 118381 diff --git a/cross_section/webscrape_forbes.ipynb b/cross_section/webscrape_forbes.ipynb new file mode 100644 index 0000000..d947bd1 --- /dev/null +++ b/cross_section/webscrape_forbes.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# parse_forbeslists\n", + "\n", + "This notebook \n", + "- parses Forbes richest lists and Forbes global 2000 list and\n", + 
# Forbes lists to download.
# Each entry is sent as-is as the query parameters of the request below, and
# its 'uri' value also names the output file ('forbes-<uri>.csv').
# Only the world billionaires list is active; the commented-out entries are
# kept as a catalogue of other lists -- uncomment one to fetch it as well.
lists = [
    { 'type': 'person', 'year': 2020, 'uri': 'billionaires' },  # World richest
    # { 'type': 'person', 'year': 2020, 'uri': 'forbes-400' },  # American richest 400
    # { 'type': 'person', 'year': 2020, 'uri': 'hong-kong-billionaires' },  # Hong Kong richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'australia-billionaires' },  # Australia richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'china-billionaires' },  # China richest 400
    # { 'type': 'person', 'year': 2020, 'uri': 'taiwan-billionaires' },  # Taiwan richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'india-billionaires' },  # India richest 100
    # { 'type': 'person', 'year': 2020, 'uri': 'japan-billionaires' },  # Japan richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'africa-billionaires' },  # Africa richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'korea-billionaires' },  # Korea richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'malaysia-billionaires' },  # Malaysia richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'philippines-billionaires' },  # Philippines richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'singapore-billionaires' },  # Singapore richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'indonesia-billionaires' },  # Indonesia richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'thailand-billionaires' },  # Thailand richest 50
    # { 'type': 'person', 'year': 2020, 'uri': 'self-made-women' },  # American richest self-made women
    # { 'type': 'person', 'year': 2018, 'uri': 'richest-in-tech' },  # tech richest
    # { 'type': 'person', 'year': 2020, 'uri': 'hedge-fund-managers' },  # hedge fund highest-earning
    # { 'type': 'person', 'year': 2020, 'uri': 'powerful-people' },  # world powerful
    # { 'type': 'person', 'year': 2020, 'uri': 'power-women' },  # world powerful women
    # { 'type': 'person', 'year': 0, 'uri': 'rtb' },  # real-time world billionaires
    # { 'type': 'person', 'year': 0, 'uri': 'rtrl' },  # real-time American richest 400
]

url = 'http://www.forbes.com/ajax/list/data'
SOURCES_DIR = Path('./sources')

# Created once up front: the directory does not depend on the list being
# fetched, and mkdir(exist_ok=True) already tolerates an existing directory.
# NOTE(review): the CSV files below are written to the current working
# directory, not into SOURCES_DIR -- confirm which location is intended.
SOURCES_DIR.mkdir(exist_ok=True, parents=True)

for forbes_list in lists:
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, params=forbes_list, timeout=30)
    # Stop on an HTTP error instead of trying to decode an error page as JSON
    # and saving it as if it were data.
    response.raise_for_status()

    DataFrame(response.json()).to_csv('forbes-{}.csv'.format(forbes_list['uri']))
# Download the 2020 Forbes Global 2000 ranking from the Forbes API and save
# the firm-size variables as 'forbes-global2000.csv' in the working directory.

headers = {
    "accept": "application/json, text/plain, */*",
    "referer": "https://www.forbes.com/global2000/",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36",
}

cookies = {
    "notice_behavior": "expressed,eu",
    "notice_gdpr_prefs": "0,1,2:1a8b5228dd7ff0717196863a5d28ce6c",
}

api_url = "https://www.forbes.com/forbesapi/org/global2000/2020/position/true.json?limit=2000"

# A timeout keeps a stalled connection from hanging the kernel forever.
api_response = requests.get(api_url, headers=headers, cookies=cookies, timeout=30)
# Stop on an HTTP error instead of trying to decode an error page as JSON.
api_response.raise_for_status()
payload = api_response.json()

# Output column name -> key of the same value in each organization record.
# Keeping both in one mapping guarantees that values and headers stay aligned.
column_keys = {
    "Company": "organizationName",
    "Country": "country",
    "Sales": "revenue",
    "Profits": "profits",
    "Assets": "assets",
    "Market Value": "marketValue",
}

# Order the rows by each record's "position" value before building the table.
organizations = sorted(
    payload["organizationList"]["organizationsLists"],
    key=lambda organization: organization["position"],
)

sample_table = [
    [organization[key] for key in column_keys.values()]
    for organization in organizations
]

global2000 = pd.DataFrame(sample_table, columns=list(column_keys))
global2000.to_csv('forbes-global2000.csv')
+ "nbformat_minor": 5 +}