Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,5 +189,4 @@ dmypy.json

*.log
titles.db*
*.html
*.png
47 changes: 25 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,47 +1,50 @@
# ILNewsDiff

See feed here: https://twitter.com/ILNewsDiff
See feed here: <https://twitter.com/ILNewsDiff>

**Interested in contributing to this project? Send us a direct message on [twitter](https://twitter.com/ILNewsDiff)**

A Twitter bot that keeps track of changes made in Israeli news websites.

Currently tracking:
* [Haaretz](https://www.haaretz.co.il/)
* [Israel Hayom](https://Israelhayom.co.il/)
* [Walla](https://www.walla.co.il/)

How does it work?
------------

* [Haaretz](https://www.haaretz.co.il/)
* [Israel Hayom](https://Israelhayom.co.il/)
* [Walla](https://www.walla.co.il/)

## How does it work?

Once a minute the code queries news feeds and compares them to a previous state saved in a local SQLite DB.

If an _interesting_ change is found, a tweet is published with the diff.

The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order.
The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order.

### What is _interesting_?

A change that
* Has happened and is not there because of a delay in the RSS feed. The code queries the article's page to look for the change.
* Is not comprised of only whitespace or punctuation.
* Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting)


Installation
------------
+ The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file.
+ `pip install -r requirements.txt`
* Has actually happened, rather than appearing only because of a delay in the RSS feed. The code queries the article's page to verify the change.
* Is not comprised of only whitespace or punctuation.
* Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting)

## Installation

* The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file.
* `pip install -r requirements.txt`

[Twitter keys](https://dev.twitter.com/) are needed.

Credits
-------
## Credits

For contributing to this repo:

* [@yuvalpinter](https://github.com/yuvalpinter)

Based on @j-e-d's code for the Twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
RSS feed fetching added for @xuv's Twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff)

+ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/
+ RSS fetching: @xuv Julien Deswaef http://xuv.be
+ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
+ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
* Original script and idea: @j-e-d Juan E.D. <http://unahormiga.com/>
* RSS fetching: @xuv Julien Deswaef <http://xuv.be>
* Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
* Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
16 changes: 12 additions & 4 deletions data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@


class DataProvider:
def __init__(self):
self.db = dataset.connect('sqlite:///titles.db')
def __init__(self, db_path="titles.db"):
self.db = dataset.connect(f'sqlite:///{db_path}')
self.articles_table = self.db['rss_ids']
self.versions_table = self.db['rss_versions']

Expand All @@ -25,9 +25,17 @@ def track_article(self, data: dict):
self.versions_table.insert(data)
logging.info(f"New article tracked: {data['url']}")

def get_article_version_count(self, artice_id: str, article_source: str, hash: str):
def get_article(self, article_id):
return self.articles_table.find_one(id=article_id)

def get_article_version_count_ex(self, article_id: str, article_source: str):
return self.versions_table.count(
self.versions_table.table.columns.article_id == article_id,
article_source=article_source)

def get_article_version_count(self, article_id: str, article_source: str, hash: str):
return self.versions_table.count(
self.versions_table.table.columns.article_id == artice_id,
self.versions_table.table.columns.article_id == article_id,
article_source=article_source,
hash=hash)

Expand Down
2 changes: 1 addition & 1 deletion css/styles.css → image_generator/css/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
}

body {
background: lightgray url('../img/paper_fibers.png') repeat;
background: lightgray url('../base_image/paper_fibers.png') repeat;
font-family: Merriweather;
font-size: 16px;
direction: rtl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,12 @@ def generate_image_diff(old: str, new: str, text_to_tweet: str):
img = Image.open('./tmp.png')
img2 = img.crop((0, 0, total_width, total_height))
if int(total_width) > int(total_height * 2):
background = Image.new('RGBA', (total_width, int(total_width / 2)),
(255, 255, 255, 0))
background = Image.new('RGBA', (total_width, int(total_width / 2)), (255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
else:
background = Image.new('RGBA', (total_width, total_height),
(255, 255, 255, 0))
background = Image.new('RGBA', (total_width, total_height), (255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
Expand Down
File renamed without changes.
Binary file removed img/paper_fibers.png
Binary file not shown.
Binary file removed img/twitter.png
Binary file not shown.
27 changes: 27 additions & 0 deletions loggers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
import os
import sys

# Optional output folder for the log file; empty string means the current
# working directory.
LOG_FOLDER = os.environ.get('LOG_FOLDER', '')

# Third-party loggers that are far too chatty at DEBUG level.
_NOISY_LOGGERS = (
    "requests", "selenium", "PIL", "requests_oauthlib",
    "chardet", "tweepy", "oauthlib", "urllib3",
)


def setup_loggers():
    """Configure root logging to both a file and stdout.

    Appends to 'titlediff.log' (UTF-8) under LOG_FOLDER, mirrors everything
    to stdout at DEBUG level, then caps noisy third-party loggers at WARNING.
    """
    logging_filehandler = logging.FileHandler(
        filename=os.path.join(LOG_FOLDER, 'titlediff.log'),
        encoding='utf-8', mode='a+')
    handlers = [logging_filehandler, logging.StreamHandler(sys.stdout)]
    logging.basicConfig(handlers=handlers,
                        format='%(asctime)s %(name)13s %(levelname)8s: %(message)s',
                        level=logging.DEBUG)

    # Silence noisy dependencies in one place instead of eight copies.
    for name in _NOISY_LOGGERS:
        logging.getLogger(name).setLevel(logging.WARNING)
46 changes: 10 additions & 36 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,29 @@
#!/usr/bin/python3

import logging
import os
import sys

from pytz import timezone

from parsers.haaretz_parser import HaaretzParser
from parsers.israel_hayom_parser import IsraelHayomParser
from parsers.walla_parser import WallaParser
from loggers import setup_loggers

# All article timestamps are recorded in Israel local time.
TIMEZONE = 'Israel'
LOCAL_TZ = timezone(TIMEZONE)

# Every parser instantiated on each run; add new news sources here.
PARSER_CLASSES = [HaaretzParser, IsraelHayomParser, WallaParser]


def main():
    """Run one polling pass: set up logging, then invoke every parser."""
    setup_loggers()
    logging.info('Starting script')

    try:
        logging.debug('Starting Parsers')
        parsers = [parser_class(LOCAL_TZ) for parser_class in PARSER_CLASSES]
        for parser in parsers:
            logging.info(f"Parsing {parser.get_source()}")
            parser.parse()
        logging.debug('Finished')
    except Exception:
        # One crashing parser must not abort the whole run (this script is
        # invoked repeatedly, e.g. from cron) — log the traceback and return.
        logging.exception('Parser')


if __name__ == "__main__":
Expand Down
Empty file removed output/.gitignore
Empty file.
6 changes: 3 additions & 3 deletions base_parser.py → parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from data_provider import DataProvider
from twitter_helper import upload_media, tweet_text, tweet_with_media
from image_diff_generator import ImageDiffGenerator
from image_generator.image_diff_generator import ImageDiffGenerator

if 'TESTING' in os.environ:
if os.environ['TESTING'] == 'False':
Expand Down Expand Up @@ -71,8 +71,7 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str):

def store_data(self, data: Dict):
if self.data_provider.is_article_tracked(data['article_id'], self.get_source()):
count = self.data_provider.get_article_version_count(data[
'article_id'], self.get_source(), data['hash'])
count = self.data_provider.get_article_version_count(data['article_id'], self.get_source(), data['hash'])
if count != 1: # Changed
self.tweet_all_changes(data)
else:
Expand Down Expand Up @@ -121,6 +120,7 @@ def loop_entries(self, entries):
articles[article_dict['article_id']] = article_dict
except BaseException:
logging.exception(f'Problem looping entry: {article}')

for article_dict in articles.values():
try:
self.store_data(article_dict)
Expand Down
1 change: 1 addition & 0 deletions parsers/parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def standard_entry_to_dict(article, source, tz, strip_description=False):
else:
article_dict['abstract'] = article['description']
od = collections.OrderedDict(sorted(article_dict.items()))
# NOTE(review): suspected hash-key bug here — per review discussion, the fix belongs in the feature extractors, not in this function.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should fix the bug in the feature extractors and not here.
I have already fixed it in my branch

article_dict['hash'] = hashlib.sha224(repr(od.items()).encode('utf-8')).hexdigest()
article_dict['date_time'] = datetime.now(tz)
return article_dict
5 changes: 2 additions & 3 deletions rss_parser.py → parsers/rss_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging

import feedparser

from base_parser import BaseParser
Expand All @@ -16,6 +15,6 @@ def parse(self):
if r is None:
logging.warning('Empty response RSS')
return
else:
logging.info('Parsing %s', r.channel.title)

logging.info('Parsing %s', r.channel.title)
self.loop_entries(r.entries)
11 changes: 11 additions & 0 deletions process_data/csv_data_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os
import pandas as pd


class CsvDataProvider:
    """Loads the exported articles/versions CSV files from a data folder."""

    def __init__(self, data_files=r"../csvs", version=0):
        """Read articles.csv and versions_<version>.csv from ``data_files``."""
        self._data_files_dir = data_files
        articles_path = os.path.join(data_files, "articles.csv")
        self.articles = pd.read_csv(articles_path)
        # Kept public so callers can derive output file paths from it too.
        self.versions_file_format = os.path.join(data_files, "versions_{version}.csv")
        versions_path = self.versions_file_format.format(version=version)
        self.versions = pd.read_csv(versions_path)

12 changes: 12 additions & 0 deletions process_data/feature_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

class FeatureExtractor(object):
    """Base class for per-version feature extractors.

    Subclasses declare their output columns via get_cols() and compute the
    matching values in extract().
    """

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def get_cols() -> list:
        """Column names produced by this extractor; the base produces none."""
        return list()

    @staticmethod
    def extract(data, previous_data, article):
        """Compute features for one version given its history; base is a no-op."""
        return None
8 changes: 8 additions & 0 deletions process_data/printer_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from feature_extractor import FeatureExtractor


class PrinterExtractor(FeatureExtractor):
    """Debugging extractor: echoes each version row and yields no features."""

    @staticmethod
    def extract(data, previous_data, article):
        # Side effect only — print the row being processed.
        print(data)
        return []
77 changes: 77 additions & 0 deletions process_data/process_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import argparse
import itertools

import pandas as pd

from printer_extractor import PrinterExtractor
from word_token_extractor import WordsTokensExtractor
from csv_data_provider import CsvDataProvider

# Columns copied through when --clean is passed; all others are dropped.
COLUMNS_TO_KEEP = ["id", "version", "article_id", "article_source", "title", "amount_of_words"]

# Per-row derived columns: output column name -> function applied to each
# versions row (none currently enabled).
ROW_EXTRACTED = {
    # "amount_of_words": WordsTokensExtractor.how_many_words
}

# Extractors run over each version together with its history
# (none currently enabled).
FEATURE_EXTRACTORS = [
    # WordsTokensExtractor,
    # PrinterExtractor
]


def process_data(data_files, clean_csv, out_version, in_version=0):
    """Read a versions CSV, derive features, and write it back as a new version.

    Args:
        data_files: Folder containing articles.csv and versions_<n>.csv.
        clean_csv: If True, keep only COLUMNS_TO_KEEP from the input.
        out_version: Version number used to name the output file.
        in_version: Version number of the input file to read (default 0).
    """
    # TODO: choose which features to extract
    dt = CsvDataProvider(data_files, in_version)

    if clean_csv:
        # The data we want to keep from the csvs
        extracted_features = dt.versions[COLUMNS_TO_KEEP]
    else:
        extracted_features = dt.versions

    # cols based on the same row data -
    for col_name, f in ROW_EXTRACTED.items():
        extracted_features[col_name] = dt.versions.apply(f, axis=1, result_type="expand")

    if len(FEATURE_EXTRACTORS) > 0:
        # cols based on a lot of data -
        cols = list(itertools.chain.from_iterable([extractor.get_cols() for extractor in FEATURE_EXTRACTORS]))
        processed_data = pd.DataFrame(columns=cols)
        row_id = 0

        # Extract Features
        for _id, article in dt.articles.iterrows():
            # All stored versions of this one (article_id, source) pair.
            article_versions = dt.versions[(dt.versions["article_id"] == article["article_id"]) &
                                           (dt.versions["article_source"] == article["article_source"])]

            for __id, single_version in article_versions.iterrows():
                # Only versions preceding the current one count as history.
                past_versions = article_versions[article_versions["version"] < single_version["version"]]
                processed_row = []
                for feature_extractor in FEATURE_EXTRACTORS:
                    processed_row.extend(feature_extractor.extract(single_version, past_versions, article))

                processed_data.loc[row_id] = processed_row
                row_id += 1

    # NOTE(review): processed_data is built above but never merged into
    # extracted_features — presumably still TODO; confirm intent.
    # NOTE(review): the next line assumes an 'amount_of_words' column exists;
    # with ROW_EXTRACTED currently empty it will KeyError unless the input
    # CSV already contains that column — verify before running.
    # TODO: export back to csv
    print(f"Max words - {extracted_features['amount_of_words'].max()}")
    out_file = dt.versions_file_format.format(version=out_version)
    extracted_features.to_csv(out_file)

    print("Done!!")


def main():
    """Parse command-line arguments and run one data-processing pass."""
    arg_parser = argparse.ArgumentParser(description='Process some data.')
    arg_parser.add_argument('datafiles', help='A path to the folders with the csvs')
    arg_parser.add_argument('--in-version', default=0, help='Which version to use')
    arg_parser.add_argument('--out-version', help='Which version to write')
    arg_parser.add_argument('--clean', action="store_true", help='Should keep only important data')

    options = arg_parser.parse_args()
    process_data(options.datafiles, options.clean, options.out_version, options.in_version)


if __name__ == "__main__":
main()
Loading