diff --git a/.gitignore b/.gitignore index a56a5f2840..e02e100c17 100644 --- a/.gitignore +++ b/.gitignore @@ -189,5 +189,4 @@ dmypy.json *.log titles.db* -*.html *.png diff --git a/README.md b/README.md index 6ac434fc71..5ec6cf743d 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,50 @@ # ILNewsDiff -See feed here: https://twitter.com/ILNewsDiff +See feed here: **Interested in contributing to this project? Send us a direct message on [twitter](https://twitter.com/ILNewsDiff)** A Twitter bot that keeps track of changes made in Israeli news websites. Currently tracking: - * [Haaretz](https://www.haaretz.co.il/) - * [Israel Hayom](https://Israelhayom.co.il/) - * [Walla](https://www.walla.co.il/) - -How does it work? ------------- + +* [Haaretz](https://www.haaretz.co.il/) +* [Israel Hayom](https://Israelhayom.co.il/) +* [Walla](https://www.walla.co.il/) + +## How does it work? + Once a minute the code queries news feeds and compares them to a previous state saved in a local SQLite DB. If an _interesting_ change is found, a tweet is published with the diff. -The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order. +The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order. ### What is _interesting_? + A change that - * Has happened and is not there because of a delay in the RSS feed. The code queries the article's page to look for the change. - * Is not comprised of only whitespace or punctuation. - * Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting) - -Installation ------------- -+ The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file. -+ `pip install -r requirements.txt` +* Has happened and is not there because of a delay in the RSS feed. The code queries the article's page to look for the change. 
+* Is not comprised of only whitespace or punctuation. +* Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting) + +## Installation + +* The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file. +* `pip install -r requirements.txt` [Twitter keys](https://dev.twitter.com/) are needed. -Credits -------- +## Credits + For contributing to this repo: + * [@yuvalpinter](https://github.com/yuvalpinter) Based on @j-e-d's code for the Twitter bot [@nyt_diff](https://twitter.com/nyt_diff). RSS feed fetching added for @xuv's Twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff) -+ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/ -+ RSS fetching: @xuv Julien Deswaef http://xuv.be -+ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather) -+ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). +* Original script and idea: @j-e-d Juan E.D. +* RSS fetching: @xuv Julien Deswaef +* Font: [Merriweather](https://fonts.google.com/specimen/Merriweather) +* Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). 
diff --git a/data_provider.py b/data_provider.py index 8838fb6313..f282ef4c64 100644 --- a/data_provider.py +++ b/data_provider.py @@ -5,8 +5,8 @@ class DataProvider: - def __init__(self): - self.db = dataset.connect('sqlite:///titles.db') + def __init__(self, db_path="titles.db"): + self.db = dataset.connect(f'sqlite:///{db_path}') self.articles_table = self.db['rss_ids'] self.versions_table = self.db['rss_versions'] @@ -25,9 +25,17 @@ def track_article(self, data: dict): self.versions_table.insert(data) logging.info(f"New article tracked: {data['url']}") - def get_article_version_count(self, artice_id: str, article_source: str, hash: str): + def get_article(self, article_id): + return self.articles_table.find_one(id=article_id) + + def get_article_version_count_ex(self, article_id: str, article_source: str): + return self.versions_table.count( + self.versions_table.table.columns.article_id == article_id, + article_source=article_source) + + def get_article_version_count(self, article_id: str, article_source: str, hash: str): return self.versions_table.count( - self.versions_table.table.columns.article_id == artice_id, + self.versions_table.table.columns.article_id == article_id, article_source=article_source, hash=hash) diff --git a/css/styles.css b/image_generator/css/styles.css similarity index 92% rename from css/styles.css rename to image_generator/css/styles.css index 90bb0f067e..b7db6c44be 100755 --- a/css/styles.css +++ b/image_generator/css/styles.css @@ -6,7 +6,7 @@ } body { - background: lightgray url('../img/paper_fibers.png') repeat; + background: lightgray url('../base_image/paper_fibers.png') repeat; font-family: Merriweather; font-size: 16px; direction: rtl; diff --git a/fonts/Merriweather-Regular.ttf b/image_generator/fonts/Merriweather-Regular.ttf similarity index 100% rename from fonts/Merriweather-Regular.ttf rename to image_generator/fonts/Merriweather-Regular.ttf diff --git a/image_diff_generator.py b/image_generator/image_diff_generator.py 
similarity index 95% rename from image_diff_generator.py rename to image_generator/image_diff_generator.py index baeb24b9cc..348c7f7022 100644 --- a/image_diff_generator.py +++ b/image_generator/image_diff_generator.py @@ -66,14 +66,12 @@ def generate_image_diff(old: str, new: str, text_to_tweet: str): img = Image.open('./tmp.png') img2 = img.crop((0, 0, total_width, total_height)) if int(total_width) > int(total_height * 2): - background = Image.new('RGBA', (total_width, int(total_width / 2)), - (255, 255, 255, 0)) + background = Image.new('RGBA', (total_width, int(total_width / 2)), (255, 255, 255, 0)) bg_w, bg_h = background.size offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2)) else: - background = Image.new('RGBA', (total_width, total_height), - (255, 255, 255, 0)) + background = Image.new('RGBA', (total_width, total_height), (255, 255, 255, 0)) bg_w, bg_h = background.size offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2)) diff --git a/template.html b/image_generator/template.html similarity index 100% rename from template.html rename to image_generator/template.html diff --git a/img/paper_fibers.png b/img/paper_fibers.png deleted file mode 100755 index 2d93493f86..0000000000 Binary files a/img/paper_fibers.png and /dev/null differ diff --git a/img/twitter.png b/img/twitter.png deleted file mode 100644 index c9b8935797..0000000000 Binary files a/img/twitter.png and /dev/null differ diff --git a/loggers.py b/loggers.py new file mode 100644 index 0000000000..aea9f42ec3 --- /dev/null +++ b/loggers.py @@ -0,0 +1,27 @@ +import logging +import os +import sys + +if 'LOG_FOLDER' in os.environ: + LOG_FOLDER = os.environ['LOG_FOLDER'] +else: + LOG_FOLDER = '' + + +def setup_loggers(): + # Add Handlers + logging_filehandler = logging.FileHandler(filename=os.path.join(LOG_FOLDER, 'titlediff.log'), encoding='utf-8', mode='a+') + handlers = [logging_filehandler, logging.StreamHandler(sys.stdout)] + 
logging.basicConfig(handlers=handlers, + format='%(asctime)s %(name)13s %(levelname)8s: %(message)s', + level=logging.DEBUG) + + # Handle some loggers + logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("selenium").setLevel(logging.WARNING) + logging.getLogger("PIL").setLevel(logging.WARNING) + logging.getLogger("requests_oauthlib").setLevel(logging.WARNING) + logging.getLogger("chardet").setLevel(logging.WARNING) + logging.getLogger("tweepy").setLevel(logging.WARNING) + logging.getLogger("oauthlib").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) diff --git a/main.py b/main.py index 19dddebc1a..a7e254678d 100644 --- a/main.py +++ b/main.py @@ -1,55 +1,29 @@ #!/usr/bin/python3 import logging -import os -import sys - from pytz import timezone from parsers.haaretz_parser import HaaretzParser from parsers.israel_hayom_parser import IsraelHayomParser from parsers.walla_parser import WallaParser +from loggers import setup_loggers TIMEZONE = 'Israel' LOCAL_TZ = timezone(TIMEZONE) - -if 'LOG_FOLDER' in os.environ: - LOG_FOLDER = os.environ['LOG_FOLDER'] -else: - LOG_FOLDER = '' +PARSER_CLASSES = [HaaretzParser, IsraelHayomParser, WallaParser] def main(): - # logging - logging_filehandler = logging.FileHandler(filename=LOG_FOLDER + 'titlediff.log', - encoding='utf-8', mode='a+') - handlers = [logging_filehandler, logging.StreamHandler(sys.stdout)] - logging.basicConfig(handlers=handlers, - format='%(asctime)s %(name)13s %(levelname)8s: %(message)s', - level=logging.DEBUG) - - logging.getLogger("requests").setLevel(logging.WARNING) - logging.getLogger("selenium").setLevel(logging.WARNING) - logging.getLogger("PIL").setLevel(logging.WARNING) - logging.getLogger("requests_oauthlib").setLevel(logging.WARNING) - logging.getLogger("chardet").setLevel(logging.WARNING) - logging.getLogger("tweepy").setLevel(logging.WARNING) - logging.getLogger("oauthlib").setLevel(logging.WARNING) - 
logging.getLogger("urllib3").setLevel(logging.WARNING) - + setup_loggers() logging.info('Starting script') - try: - logging.debug('Starting Parsers') - parsers = [HaaretzParser(LOCAL_TZ), IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] - for parser in parsers: - logging.info(f"Parsing {parser.get_source()}") - parser.parse() - logging.debug('Finished') - except Exception: - logging.exception('Parser') - - logging.info('Finished script') + logging.debug('Starting Parsers') + parsers = [parser_class(LOCAL_TZ) for parser_class in PARSER_CLASSES] + for parser in parsers: + logging.info(f"Parsing {parser.get_source()}") + parser.parse() + + logging.debug('Finished') if __name__ == "__main__": diff --git a/output/.gitignore b/output/.gitignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/base_parser.py b/parsers/base_parser.py similarity index 96% rename from base_parser.py rename to parsers/base_parser.py index 15284b9d8f..0697505e72 100644 --- a/base_parser.py +++ b/parsers/base_parser.py @@ -5,7 +5,7 @@ from data_provider import DataProvider from twitter_helper import upload_media, tweet_text, tweet_with_media -from image_diff_generator import ImageDiffGenerator +from image_generator.image_diff_generator import ImageDiffGenerator if 'TESTING' in os.environ: if os.environ['TESTING'] == 'False': @@ -71,8 +71,7 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str): def store_data(self, data: Dict): if self.data_provider.is_article_tracked(data['article_id'], self.get_source()): - count = self.data_provider.get_article_version_count(data[ - 'article_id'], self.get_source(), data['hash']) + count = self.data_provider.get_article_version_count(data['article_id'], self.get_source(), data['hash']) if count != 1: # Changed self.tweet_all_changes(data) else: @@ -121,6 +120,7 @@ def loop_entries(self, entries): articles[article_dict['article_id']] = article_dict except BaseException: logging.exception(f'Problem looping entry: 
{article}') + for article_dict in articles.values(): try: self.store_data(article_dict) diff --git a/parsers/parser_utils.py b/parsers/parser_utils.py index 931da87ac1..dc6e1a1b1e 100644 --- a/parsers/parser_utils.py +++ b/parsers/parser_utils.py @@ -16,6 +16,7 @@ def standard_entry_to_dict(article, source, tz, strip_description=False): else: article_dict['abstract'] = article['description'] od = collections.OrderedDict(sorted(article_dict.items())) + # The bug of the key?? article_dict['hash'] = hashlib.sha224(repr(od.items()).encode('utf-8')).hexdigest() article_dict['date_time'] = datetime.now(tz) return article_dict diff --git a/rss_parser.py b/parsers/rss_parser.py similarity index 84% rename from rss_parser.py rename to parsers/rss_parser.py index f473e844ef..eb3a7c156e 100644 --- a/rss_parser.py +++ b/parsers/rss_parser.py @@ -1,5 +1,4 @@ import logging - import feedparser from base_parser import BaseParser @@ -16,6 +15,6 @@ def parse(self): if r is None: logging.warning('Empty response RSS') return - else: - logging.info('Parsing %s', r.channel.title) + + logging.info('Parsing %s', r.channel.title) self.loop_entries(r.entries) diff --git a/process_data/csv_data_provider.py b/process_data/csv_data_provider.py new file mode 100644 index 0000000000..ac76020ce1 --- /dev/null +++ b/process_data/csv_data_provider.py @@ -0,0 +1,11 @@ +import os +import pandas as pd + + +class CsvDataProvider: + def __init__(self, data_files=r"../csvs", version=0): + self._data_files_dir = data_files + self.articles = pd.read_csv(os.path.join(data_files, "articles.csv")) + self.versions_file_format = os.path.join(data_files, "versions_{version}.csv") + self.versions = pd.read_csv(self.versions_file_format.format(version=version)) + diff --git a/process_data/feature_extractor.py b/process_data/feature_extractor.py new file mode 100644 index 0000000000..b5582fb334 --- /dev/null +++ b/process_data/feature_extractor.py @@ -0,0 +1,12 @@ + +class FeatureExtractor(object): + def 
__init__(self) -> None: + pass + + @staticmethod + def get_cols() -> list: + return [] + + @staticmethod + def extract(data, previous_data, article): + pass diff --git a/process_data/printer_extractor.py b/process_data/printer_extractor.py new file mode 100644 index 0000000000..7c8de25094 --- /dev/null +++ b/process_data/printer_extractor.py @@ -0,0 +1,8 @@ +from feature_extractor import FeatureExtractor + + +class PrinterExtractor(FeatureExtractor): + @staticmethod + def extract(data, previous_data, article): + print(data) + return [] diff --git a/process_data/process_data.py b/process_data/process_data.py new file mode 100644 index 0000000000..08b3e4963c --- /dev/null +++ b/process_data/process_data.py @@ -0,0 +1,77 @@ +import argparse +import itertools + +import pandas as pd + +from printer_extractor import PrinterExtractor +from word_token_extractor import WordsTokensExtractor +from csv_data_provider import CsvDataProvider + +COLUMNS_TO_KEEP = ["id", "version", "article_id", "article_source", "title", "amount_of_words"] + +ROW_EXTRACTED = { + # "amount_of_words": WordsTokensExtractor.how_many_words +} + +FEATURE_EXTRACTORS = [ + # WordsTokensExtractor, + # PrinterExtractor +] + + +def process_data(data_files, clean_csv, out_version, in_version=0): + # TODO: choose which features to extract + dt = CsvDataProvider(data_files, in_version) + + if clean_csv: + # The data we want to keep from the csvs + extracted_features = dt.versions[COLUMNS_TO_KEEP] + else: + extracted_features = dt.versions + + # cols based on the same row data - + for col_name, f in ROW_EXTRACTED.items(): + extracted_features[col_name] = dt.versions.apply(f, axis=1, result_type="expand") + + if len(FEATURE_EXTRACTORS) > 0: + # cols based on a lot of data - + cols = list(itertools.chain.from_iterable([extractor.get_cols() for extractor in FEATURE_EXTRACTORS])) + processed_data = pd.DataFrame(columns=cols) + row_id = 0 + + # Extract Features + for _id, article in dt.articles.iterrows(): + 
article_versions = dt.versions[(dt.versions["article_id"] == article["article_id"]) & + (dt.versions["article_source"] == article["article_source"])] + + for __id, single_version in article_versions.iterrows(): + past_versions = article_versions[article_versions["version"] < single_version["version"]] + processed_row = [] + for feature_extractor in FEATURE_EXTRACTORS: + processed_row.extend(feature_extractor.extract(single_version, past_versions, article)) + + processed_data.loc[row_id] = processed_row + row_id += 1 + + # TODO: export back to csv + print(f"Max words - {extracted_features['amount_of_words'].max()}") + out_file = dt.versions_file_format.format(version=out_version) + extracted_features.to_csv(out_file) + + print("Done!!") + + +def main(): + parser = argparse.ArgumentParser(description='Process some data.') + parser.add_argument('datafiles', help='A path to the folders with the csvs') + parser.add_argument('--in-version', default=0, help='Which version to use') + parser.add_argument('--out-version', help='Which version to write') + + parser.add_argument('--clean', action="store_true", help='Should keep only important data') + + args = parser.parse_args() + process_data(args.datafiles, args.clean, args.out_version, args.in_version) + + +if __name__ == "__main__": + main() diff --git a/process_data/word_token_extractor.py b/process_data/word_token_extractor.py new file mode 100644 index 0000000000..4724ce26b6 --- /dev/null +++ b/process_data/word_token_extractor.py @@ -0,0 +1,16 @@ +from feature_extractor import FeatureExtractor + + +class WordsTokensExtractor(FeatureExtractor): + @staticmethod + def get_cols() -> list: + return ["amount_of_words", "title"] + + @staticmethod + def how_many_words(data): + return data.title.count(" ") + 1 + + @staticmethod + def extract(data, previous_data, article): + amount_of_words = WordsTokensExtractor.how_many_words(data) + return [amount_of_words, data.title] diff --git a/requirements.txt b/requirements.txt index 
20d491402b..559fe52048 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ six==1.10.0 SQLAlchemy==1.3.6 tweepy==3.10.0 beautifulsoup4==4.9.3 -flake8==3.8.4 \ No newline at end of file +flake8==3.8.4 +pandas==1.2.0 \ No newline at end of file