QuantEcon · shlff · May 3, 2023 · Apr 16, 2023 · Apr 17, 2023 · Apr 29, 2023
diff --git a/cross_section/README.md b/cross_section/README.md
@@ -0,0 +1,33 @@
+### Datasets information
+
+Here is an information table for the dataset:
+
+| name                     | description                                                 | size  | source                                                    | Features                                                     |
+| ------------------------ | ----------------------------------------------------------- | ----- | --------------------------------------------------------- | ------------------------------------------------------------ |
+| cities_brazil.csv        | Population of cities in 2023 Brazil                         | 18kb  | https://worldpopulationreview.com/countries/cities/brazil |                                                              |
+| cities_us.csv            | Population of cities in United States                       | 37kb  | https://worldpopulationreview.com/us-cities               | Including population data in 2023, 2022, 2020 census, 2010 census |
+| forbes-billionaires.csv* | The world's richest 2000 people in 2020 according to Forbes | 790kb | https://www.forbes.com/billionaires/                      |                                                              |
+| forbes-global2000.csv*   | The world's largest 2000 firms in 2020 according to Forbes  | 118kb | https://www.forbes.com/lists/global2000/                  | Variables measuring the firm size: sales, profits, assets, market value |
+|                          |                                                             |       |                                                           |                                                              |
+
+*These two datasets are generated using the file ``webscrape_forbes.ipynb`` with the Forbes API.
+
+### Instructions on fetching the data for use in a lecture
+
+There are many different method of fetching and using the data.
+
+In ``Python`` environment you can follow:
+
+Step 1: find the url code by clicking the file -->> view raw -->> copy the url path
+
+Step 2: paste the url path in the following code
+
+```
+import pandas as pd
+
+url = "<Paste the url path here>"
+pd.read_csv(url)                              # for csv files
+```
+
+
+
diff --git a/cross_section/cities_brazil.csv b/cross_section/cities_brazil.csv
diff --git a/cross_section/cities_us.csv b/cross_section/cities_us.csv
diff --git a/cross_section/forbes-billionaires.csv b/cross_section/forbes-billionaires.csv
diff --git a/cross_section/forbes-global2000.csv b/cross_section/forbes-global2000.csv
diff --git a/cross_section/webscrape_forbes.ipynb b/cross_section/webscrape_forbes.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# parse_forbeslists\n",
+    "\n",
+    "This notebook \n",
+    "- parses Forbes richest lists and Forbes global 2000 list and\n",
+    "- saves them as csv files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "from pandas import DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Forbes lists\n",
+    "lists = [ \n",
+    "    { 'type': 'person', 'year': 2020, 'uri': 'billionaires' },                  # World richest\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'forbes-400' },                  # American richest 400\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'hong-kong-billionaires' },      # Hong Kong richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'australia-billionaires' },      # Australia richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'china-billionaires' },          # China richest 400\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'taiwan-billionaires' },         # Taiwan richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'india-billionaires' },          # India richest 100\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'japan-billionaires' },          # Japan richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'africa-billionaires' },         # Africa richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'korea-billionaires' },          # Korea richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'malaysia-billionaires' },       # Malaysia richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'philippines-billionaires' },    # Philippines richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'singapore-billionaires' },      # Singapore richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'indonesia-billionaires' },      # Indonesia richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'thailand-billionaires' },       # Thailand richest 50\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'self-made-women' },             # American richest self-made women\n",
+    "    # { 'type': 'person', 'year': 2018, 'uri': 'richest-in-tech' },             # tech richest\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'hedge-fund-managers' },         # hedge fund highest-earning\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'powerful-people' },             # world powerful\n",
+    "    # { 'type': 'person', 'year': 2020, 'uri': 'power-women' },                 # world powerful women\n",
+    "    # { 'type': 'person', 'year': 0, 'uri': 'rtb' },                            # real-time world billionaires\n",
+    "    # { 'type': 'person', 'year': 0, 'uri': 'rtrl' },                           # real-time American richest 400\n",
+    "]\n",
+    "\n",
+    "url = 'http://www.forbes.com/ajax/list/data'\n",
+    "SOURCES_DIR = Path('./sources')\n",
+    "\n",
+    "for forbes_list in lists:\n",
+    "    response = requests.get(url, params=forbes_list)\n",
+    "\n",
+    "    if not SOURCES_DIR.exists():\n",
+    "        SOURCES_DIR.mkdir(exist_ok=True, parents=True)\n",
+    "\n",
+    "    DataFrame(response.json()).to_csv('forbes-{}.csv'.format(forbes_list['uri']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then Forbes Global 2000 for the largest 2000 firms globally."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "headers = {\n",
+    "    \"accept\": \"application/json, text/plain, */*\",\n",
+    "    \"referer\": \"https://www.forbes.com/global2000/\",\n",
+    "    \"user-agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36\",\n",
+    "}\n",
+    "\n",
+    "cookies = {\n",
+    "    \"notice_behavior\": \"expressed,eu\",\n",
+    "    \"notice_gdpr_prefs\": \"0,1,2:1a8b5228dd7ff0717196863a5d28ce6c\",\n",
+    "}\n",
+    "\n",
+    "api_url = \"https://www.forbes.com/forbesapi/org/global2000/2020/position/true.json?limit=2000\"\n",
+    "response = requests.get(api_url, headers=headers, cookies=cookies).json()\n",
+    "\n",
+    "sample_table = [\n",
+    "    [\n",
+    "        item[\"organizationName\"],\n",
+    "        item[\"country\"],\n",
+    "        item[\"revenue\"],\n",
+    "        item[\"profits\"],\n",
+    "        item[\"assets\"],\n",
+    "        item[\"marketValue\"]\n",
+    "    ] for item in\n",
+    "    sorted(response[\"organizationList\"][\"organizationsLists\"], key=lambda k: k[\"position\"])\n",
+    "]\n",
+    "\n",
+    "dfff = pd.DataFrame(sample_table, columns=[\"Company\", \"Country\", \"Sales\", \"Profits\", \"Assets\", \"Market Value\"])\n",
+    "dfff.to_csv('forbes-global2000.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}