From 23f73de9dd52e2d2fb8aee7de847439e3e710af1 Mon Sep 17 00:00:00 2001 From: Alon Date: Wed, 3 Feb 2021 19:44:20 +0200 Subject: [PATCH 1/3] Added Walla parser --- israel_hayom_parser.py | 38 ------------------- main.py | 7 ++-- .../haaretz_parser.py | 17 +-------- parsers/israel_hayom_parser.py | 25 ++++++++++++ parsers/parser_utils.py | 21 ++++++++++ parsers/walla_parser.py | 28 ++++++++++++++ scripts/validate_parser.py | 2 +- 7 files changed, 81 insertions(+), 57 deletions(-) delete mode 100644 israel_hayom_parser.py rename haaretz_parser.py => parsers/haaretz_parser.py (50%) create mode 100644 parsers/israel_hayom_parser.py create mode 100644 parsers/parser_utils.py create mode 100644 parsers/walla_parser.py diff --git a/israel_hayom_parser.py b/israel_hayom_parser.py deleted file mode 100644 index 494b85abdf..0000000000 --- a/israel_hayom_parser.py +++ /dev/null @@ -1,38 +0,0 @@ -import collections -import hashlib -from datetime import datetime - -import validators.content_validator -from rss_parser import RSSParser - -ISRAEL_HAYOM_RSS = "https://www.israelhayom.co.il/rss.xml" - - -class IsraelHayomParser(RSSParser): - def __init__(self, tz): - RSSParser.__init__(self, ISRAEL_HAYOM_RSS) - self.tz = tz - - @staticmethod - def get_source(): - return "IsraelHayom" - - def should_use_first_item_dedup(self): - return True - - def get_tweet_validators(self): - return [validators.content_validator] - - def entry_to_dict(self, article): - article_dict = dict() - article_dict['article_id'] = article.guid - article_dict['article_source'] = self.get_source() - article_dict['url'] = article.link - article_dict['title'] = article.title - article_dict['abstract'] = article['description'] - od = collections.OrderedDict(sorted(article_dict.items())) - article_dict['hash'] = hashlib.sha224( - repr(od.items()).encode('utf-8')).hexdigest() - article_dict['date_time'] = datetime.now(self.tz) - return article_dict - diff --git a/main.py b/main.py index 8b0910660b..78d2acfeb6 100644 --- a/main.py +++ b/main.py @@ -6,8 +6,9 @@ from pytz import timezone -from haaretz_parser import HaaretzParser -from israel_hayom_parser import IsraelHayomParser +from parsers.haaretz_parser import HaaretzParser +from parsers.israel_hayom_parser import IsraelHayomParser +from parsers.walla_parser import WallaParser TIMEZONE = 'Israel' LOCAL_TZ = timezone(TIMEZONE) @@ -38,7 +39,7 @@ def main(): try: logging.debug('Starting Parsers') - parsers = [HaaretzParser(LOCAL_TZ), IsraelHayomParser(LOCAL_TZ)] + parsers = [HaaretzParser(LOCAL_TZ), IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] for parser in parsers: logging.info(f"Parsing {parser.get_source()}") parser.parse() diff --git a/haaretz_parser.py b/parsers/haaretz_parser.py similarity index 50% rename from haaretz_parser.py rename to parsers/haaretz_parser.py index 1a4ffeac56..0f2df178ae 100644 --- a/haaretz_parser.py +++ b/parsers/haaretz_parser.py @@ -1,9 +1,6 @@ -import collections -import hashlib -from datetime import datetime - import validators.html_validator import validators.content_validator +from parsers import parser_utils from rss_parser import RSSParser HAARETZ_RSS = "https://www.haaretz.co.il/cmlink/1.1617539" @@ -28,14 +25,4 @@ def get_tweet_validators(self): return [validators.content_validator] def entry_to_dict(self, article): - article_dict = dict() - article_dict['article_id'] = article.guid - article_dict['article_source'] = self.get_source() - article_dict['url'] = article.link - article_dict['title'] = article.title - article_dict['abstract'] = article['description'] - od = collections.OrderedDict(sorted(article_dict.items())) - article_dict['hash'] = hashlib.sha224( - repr(od.items()).encode('utf-8')).hexdigest() - article_dict['date_time'] = datetime.now(self.tz) - return article_dict + return parser_utils.standard_entry_to_dict(article, self.get_source(), self.tz) diff --git a/parsers/israel_hayom_parser.py b/parsers/israel_hayom_parser.py new file mode 100644 index 0000000000..8e558218f0 --- /dev/null +++ b/parsers/israel_hayom_parser.py @@ -0,0 +1,25 @@ +import validators.content_validator +from parsers import parser_utils +from rss_parser import RSSParser + +ISRAEL_HAYOM_RSS = "https://www.israelhayom.co.il/rss.xml" + + +class IsraelHayomParser(RSSParser): + def __init__(self, tz): + RSSParser.__init__(self, ISRAEL_HAYOM_RSS) + self.tz = tz + + @staticmethod + def get_source(): + return "IsraelHayom" + + def should_use_first_item_dedup(self): + return True + + def get_tweet_validators(self): + return [validators.content_validator] + + def entry_to_dict(self, article): + return parser_utils.standard_entry_to_dict(article, self.get_source(), self.tz) + diff --git a/parsers/parser_utils.py b/parsers/parser_utils.py new file mode 100644 index 0000000000..931da87ac1 --- /dev/null +++ b/parsers/parser_utils.py @@ -0,0 +1,21 @@ +import collections +import hashlib +from datetime import datetime + +from html_utils import strip_html + + +def standard_entry_to_dict(article, source, tz, strip_description=False): + article_dict = dict() + article_dict['article_id'] = article.guid + article_dict['article_source'] = source + article_dict['url'] = article.link + article_dict['title'] = article.title + if strip_description: + article_dict['abstract'] = strip_html(article['description']) + else: + article_dict['abstract'] = article['description'] + od = collections.OrderedDict(sorted(article_dict.items())) + article_dict['hash'] = hashlib.sha224(repr(od.items()).encode('utf-8')).hexdigest() + article_dict['date_time'] = datetime.now(tz) + return article_dict diff --git a/parsers/walla_parser.py b/parsers/walla_parser.py new file mode 100644 index 0000000000..404059fb93 --- /dev/null +++ b/parsers/walla_parser.py @@ -0,0 +1,28 @@ +import validators.html_validator +import validators.content_validator +from parsers import parser_utils +from rss_parser import RSSParser + +WALLA_RSS = "https://rss.walla.co.il/feed/1?type=main" + + +class WallaParser(RSSParser): + def __init__(self, tz): + RSSParser.__init__(self, WALLA_RSS) + self.tz = tz + + @staticmethod + def get_source(): + return "Walla" + + def should_use_first_item_dedup(self): + return True + + def get_integrity_validators(self): + return [validators.html_validator] + + def get_tweet_validators(self): + return [validators.content_validator] + + def entry_to_dict(self, article): + return parser_utils.standard_entry_to_dict(article, self.get_source(), self.tz, strip_description=True) diff --git a/scripts/validate_parser.py b/scripts/validate_parser.py index 7c1f4333e5..f7b06da6a9 100644 --- a/scripts/validate_parser.py +++ b/scripts/validate_parser.py @@ -3,7 +3,7 @@ import feedparser from validators import html_validator -from israel_hayom_parser import IsraelHayomParser as Parser +from parsers.walla_parser import WallaParser as Parser TIMEZONE = 'Israel' LOCAL_TZ = timezone(TIMEZONE) From 88bd54e360d57b6ad5cd5f6a092926c511499f5d Mon Sep 17 00:00:00 2001 From: Alon Date: Fri, 5 Feb 2021 10:52:06 +0200 Subject: [PATCH 2/3] Track image --- base_parser.py | 18 ++++++++++---- data_provider.py | 12 +++++----- image_diff_generator.py | 31 +++++++++++++++++------- image_template.html | 37 +++++++++++++++++++++++++++++ main.py | 2 +- parsers/haaretz_parser.py | 7 +++++- template.html => text_template.html | 0 7 files changed, 86 insertions(+), 21 deletions(-) create mode 100644 image_template.html rename template.html => text_template.html (100%) diff --git a/base_parser.py b/base_parser.py index d256df938c..0c2d2e7242 100644 --- a/base_parser.py +++ b/base_parser.py @@ -71,19 +71,22 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str): logging.info(f'Id to store: {tweet_id}') self.data_provider.update_tweet_db(article_id, self.get_source(), tweet_id) - def store_data(self, data: Dict): + def handle_articles(self, data: Dict): if self.data_provider.is_article_tracked(data['article_id'], self.get_source()): - count = self.data_provider.get_article_version_count(data[ - 'article_id'], self.get_source(), data['hash']) + count = self.data_provider.get_article_version_count(data['article_id'], self.get_source(), data['hash']) if count != 1: # Changed self.tweet_all_changes(data) else: self.data_provider.track_article(data) def tweet_change(self, previous_data: str, current_data: str, text_to_tweet: str, article_id: str, url: str): - saved_image_diff_path = ImageDiffGenerator.generate_image_diff(previous_data, current_data, text_to_tweet) + saved_image_diff_path = ImageDiffGenerator.generate_text_diff(previous_data, current_data, text_to_tweet) self.tweet(text_to_tweet, article_id, url, saved_image_diff_path) + def tweet_image_change(self, old_url: str, new_url: str, article_id: str, url: str): + saved_image_diff_path = ImageDiffGenerator.generate_image_diff(old_url, new_url, "שינוי בתמונה") + self.tweet("שינוי בתמונה", article_id, url, saved_image_diff_path) + def tweet_all_changes(self, data: Dict): article_id = data['article_id'] url = data['url'] @@ -101,6 +104,11 @@ def tweet_all_changes(self, data: Dict): if self.should_tweet(url, previous_version['abstract'], data['abstract']): self.tweet_change(previous_version['abstract'], data['abstract'], "שינוי בתת כותרת", article_id, url) + if data["image"] and previous_version["image"] and data["image"] != previous_version["image"]: + # TODO: Add validator if image is up + save_to_db = True + self.tweet_image_change(previous_version["image"], data["image"], article_id, url) + if save_to_db: self.data_provider.increase_article_version(data) @@ -125,6 +133,6 @@ def loop_entries(self, entries): logging.exception(f'Problem looping entry: {article}') for article_dict in articles.values(): try: - self.store_data(article_dict) + self.handle_articles(article_dict) except BaseException as e: logging.exception(f'Problem looping entry: {article_dict}') diff --git a/data_provider.py b/data_provider.py index 40abd40158..20dd18a2e9 100644 --- a/data_provider.py +++ b/data_provider.py @@ -4,8 +4,8 @@ import dataset -class DataProvider(): +class DataProvider: def __init__(self): self.db = dataset.connect('sqlite:///titles.db') self.articles_table = self.db['rss_ids'] @@ -25,12 +25,12 @@ def track_article(self, data: dict): data['version'] = 1 self.versions_table.insert(data) logging.info(f"New article tracked: {data['url']}") - + def get_article_version_count(self, artice_id: str, article_source: str, hash: str): return self.versions_table.count( - self.versions_table.table.columns.article_id == artice_id, - article_source=article_source, - hash=hash) + self.versions_table.table.columns.article_id == artice_id, + article_source=article_source, + hash=hash) def get_previous_article_version(self, article_id: str, article_source: str): return self.db.query(f'SELECT * \ @@ -53,7 +53,7 @@ def update_tweet_db(self, article_id: str, article_source: str, tweet_id: str): } self.articles_table.update(article, ['article_id', 'article_source']) logging.debug('Updated tweet ID in db') - + def get_previous_tweet_id(self, article_id: str, article_source: str): search = self.articles_table.find_one(article_id=article_id, article_source=article_source) if search is None or 'tweet_id' not in search: diff --git a/image_diff_generator.py b/image_diff_generator.py index 0e8567d7a3..97f0a400c4 100644 --- a/image_diff_generator.py +++ b/image_diff_generator.py @@ -10,30 +10,45 @@ class ImageDiffGenerator: - html_template = None + text_diff_template = None + image_diff_template = None driver = None phantomjs_path = None @staticmethod def init(): - if ImageDiffGenerator.html_template is None: - with open("template.html", "r", encoding="utf-8") as html_file: - ImageDiffGenerator.html_template = html_file.read() + if ImageDiffGenerator.text_diff_template is None: + with open("text_template.html", "r", encoding="utf-8") as html_file: + ImageDiffGenerator.text_diff_template = html_file.read() + + with open("image_template.html", "r", encoding="utf-8") as html_file: + ImageDiffGenerator.image_diff_template = html_file.read() ImageDiffGenerator.phantomjs_path = os.environ['PHANTOMJS_PATH'] ImageDiffGenerator.driver = webdriver.PhantomJS(executable_path=ImageDiffGenerator.phantomjs_path) @staticmethod - def generate_image_diff(old: str, new: str, text_to_tweet: str): + def generate_text_diff(old: str, new: str, text_to_tweet: str): ImageDiffGenerator.init() stripped_old = strip_html(old) stripped_new = strip_html(new) new_hash = hashlib.sha224(stripped_new.encode('utf8')).hexdigest() diff_html = html_diff(stripped_old, stripped_new) - html = ImageDiffGenerator.html_template.replace("text_to_tweet", text_to_tweet) \ - .replace("diff_html", diff_html) + html = ImageDiffGenerator.text_diff_template.replace("text_to_tweet", text_to_tweet).replace("diff_html", + diff_html) + return ImageDiffGenerator.generate_image(html, new_hash) + @staticmethod + def generate_image_diff(old: str, new: str, text_to_tweet: str): + ImageDiffGenerator.init() + html = ImageDiffGenerator.image_diff_template.replace("old", old).replace("new", new).replace("text_to_tweet", + text_to_tweet) + new_hash = hashlib.sha224(new.encode('utf8')).hexdigest() + return ImageDiffGenerator.generate_image(html, new_hash) + + @staticmethod + def generate_image(html, diff_hash): with open('tmp.html', 'w', encoding="utf-8") as f: f.write(html) @@ -65,7 +80,7 @@ def generate_image_diff(old: str, new: str, text_to_tweet: str): offset = (int((bg_w - total_width) / 2), int((bg_h - total_height) / 2)) background.paste(img2, offset) - filename = timestamp + new_hash + filename = timestamp + diff_hash saved_file_path = f'./output/{filename}.png' background.save(saved_file_path) return saved_file_path diff --git a/image_template.html b/image_template.html new file mode 100644 index 0000000000..ea1cdf458e --- /dev/null +++ b/image_template.html @@ -0,0 +1,37 @@ + + + + + + + + +
+
+ text_to_tweet: +
+
+
+ +
+
+ ← +
+
+ +
+
+
+

+ + @ILNewsDiff + + כותרת בשינוי אדרת + +

+
+
+ + \ No newline at end of file diff --git a/main.py b/main.py index 78d2acfeb6..94a45c1e49 100644 --- a/main.py +++ b/main.py @@ -39,7 +39,7 @@ def main(): try: logging.debug('Starting Parsers') - parsers = [HaaretzParser(LOCAL_TZ), IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] + parsers = [HaaretzParser(LOCAL_TZ)]#, IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] for parser in parsers: logging.info(f"Parsing {parser.get_source()}") parser.parse() diff --git a/parsers/haaretz_parser.py b/parsers/haaretz_parser.py index 0f2df178ae..85d8cde48c 100644 --- a/parsers/haaretz_parser.py +++ b/parsers/haaretz_parser.py @@ -25,4 +25,9 @@ def get_tweet_validators(self): return [validators.content_validator] def entry_to_dict(self, article): - return parser_utils.standard_entry_to_dict(article, self.get_source(), self.tz) + article_dict = parser_utils.standard_entry_to_dict(article, self.get_source(), self.tz) + media = None + if len(article.media_content) > 0: + media = article.media_content[0]["url"] + article_dict["image"] = media + return article_dict diff --git a/template.html b/text_template.html similarity index 100% rename from template.html rename to text_template.html From 98fdedcc9a521c0e849b97f306a7ce21966e84f0 Mon Sep 17 00:00:00 2001 From: Alon Date: Sat, 6 Feb 2021 22:43:57 +0200 Subject: [PATCH 3/3] Rony did his CSS magic --- css/styles.css | 50 +++++++++++++++++++++++++++++------------ image_diff_generator.py | 4 ++-- image_template.html | 16 +++++-------- img/arrow_back-24px.svg | 1 + main.py | 2 +- 5 files changed, 45 insertions(+), 28 deletions(-) create mode 100644 img/arrow_back-24px.svg diff --git a/css/styles.css b/css/styles.css index 90bb0f067e..d2543552d2 100755 --- a/css/styles.css +++ b/css/styles.css @@ -23,17 +23,17 @@ p { } del { - background-color: lightpink; - color: black; - text-decoration: line-through; - font-weight: lighter; + background-color: lightpink; + color: black; + text-decoration: line-through; + font-weight: lighter; } ins { - background-color: aquamarine; - color: black; - text-decoration: none; - font-weight: bold; + background-color: aquamarine; + color: black; + text-decoration: none; + font-weight: bold; } img { @@ -43,14 +43,36 @@ img { width: 30px; } +#wrapper { + padding-right: 10px; +} + .alignleft { margin-right: 0em; - font-size: 14px; - text-align:left; - direction:ltr; - color:gray + text-align: left; + direction: ltr; + color: gray } + .alignright { - float:right; -} \ No newline at end of file + float: right; +} + +.row { + display: flex; + flex-direction: row; + align-items: center; + width: 500px; + margin-top: 10px; +} + +.img { + width: 175px; + height: 100px; +} + +.padded { + padding-left: 50px; + padding-right: 50px; +} diff --git a/image_diff_generator.py b/image_diff_generator.py index 97f0a400c4..fd69d74643 100644 --- a/image_diff_generator.py +++ b/image_diff_generator.py @@ -42,7 +42,7 @@ def generate_text_diff(old: str, new: str, text_to_tweet: str): @staticmethod def generate_image_diff(old: str, new: str, text_to_tweet: str): ImageDiffGenerator.init() - html = ImageDiffGenerator.image_diff_template.replace("old", old).replace("new", new).replace("text_to_tweet", + html = ImageDiffGenerator.image_diff_template.replace("old_img", old).replace("new_img", new).replace("text_to_tweet", text_to_tweet) new_hash = hashlib.sha224(new.encode('utf8')).hexdigest() return ImageDiffGenerator.generate_image(html, new_hash) @@ -62,7 +62,7 @@ def generate_image(html, diff_hash): block_width = e.size['width'] end_width = start_width total_height = start_height + block_height + end_height - total_width = 510 # Override because body width is set to 500 + total_width = 520 # Override because body width is set to 500 timestamp = str(int(time.time())) ImageDiffGenerator.driver.save_screenshot('./tmp.png') img = Image.open('./tmp.png') diff --git a/image_template.html b/image_template.html index ea1cdf458e..26f391cf34 100644 --- a/image_template.html +++ b/image_template.html @@ -11,17 +11,11 @@ text_to_tweet:
-
- -
-
- ← -
-
- -
+ + + back-arrow +

diff --git a/img/arrow_back-24px.svg b/img/arrow_back-24px.svg new file mode 100644 index 0000000000..26300ebf08 --- /dev/null +++ b/img/arrow_back-24px.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/main.py b/main.py index 94a45c1e49..78d2acfeb6 100644 --- a/main.py +++ b/main.py @@ -39,7 +39,7 @@ def main(): try: logging.debug('Starting Parsers') - parsers = [HaaretzParser(LOCAL_TZ)]#, IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] + parsers = [HaaretzParser(LOCAL_TZ), IsraelHayomParser(LOCAL_TZ), WallaParser(LOCAL_TZ)] for parser in parsers: logging.info(f"Parsing {parser.get_source()}") parser.parse()