Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,5 +189,4 @@ dmypy.json

*.log
titles.db*
*.html
*.png
47 changes: 25 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,47 +1,50 @@
# ILNewsDiff

See feed here: https://twitter.com/ILNewsDiff
See feed here: <https://twitter.com/ILNewsDiff>

**Interested in contributing to this project? Send us a direct message on [twitter](https://twitter.com/ILNewsDiff)**

A Twitter bot that keeps track of changes made in Israeli news websites.

Currently tracking:
* [Haaretz](https://www.haaretz.co.il/)
* [Israel Hayom](https://Israelhayom.co.il/)
* [Walla](https://www.walla.co.il/)

How does it work?
------------

* [Haaretz](https://www.haaretz.co.il/)
* [Israel Hayom](https://Israelhayom.co.il/)
* [Walla](https://www.walla.co.il/)

## How does it work?

Once a minute the code queries news feeds and compares them to a previous state saved in a local SQLite DB.

If an _interesting_ change is found, a tweet is published with the diff.

The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order.
The first tweet of a diff is always the article itself as a link, and all the subsequent changes are chained by order.

### What is _interesting_?

A change that
* Has happened and is not there because of a delay in the RSS feed. The code queries the article's page to look for the change.
* Is not comprised of only whitespace or punctuation.
* Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting)


Installation
------------
+ The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file.
+ `pip install -r requirements.txt`
* Has actually happened, rather than appearing only because of a delay in the RSS feed. The code queries the article's page to verify the change.
* Is not comprised of only whitespace or punctuation.
* Has a difference of more than one letter (Though adding/removing a question mark '?' is interesting)

## Installation

* The [phantomjs](http://phantomjs.org/) binary needs to be installed, and the path updated in the run_diff.sh file.
* `pip install -r requirements.txt`

[Twitter keys](https://dev.twitter.com/) are needed.

Credits
-------
## Credits

For contributing to this repo:

* [@yuvalpinter](https://github.com/yuvalpinter)

Based on @j-e-d's code for the Twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
RSS feed fetching added for @xuv's Twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff)

+ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/
+ RSS fetching: @xuv Julien Deswaef http://xuv.be
+ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
+ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
* Original script and idea: @j-e-d Juan E.D. <http://unahormiga.com/>
* RSS fetching: @xuv Julien Deswaef <http://xuv.be>
* Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
* Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
16 changes: 12 additions & 4 deletions data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@


class DataProvider:
def __init__(self):
self.db = dataset.connect('sqlite:///titles.db')
def __init__(self, db_path="titles.db"):
self.db = dataset.connect(f'sqlite:///{db_path}')
self.articles_table = self.db['rss_ids']
self.versions_table = self.db['rss_versions']

Expand All @@ -25,9 +25,17 @@ def track_article(self, data: dict):
self.versions_table.insert(data)
logging.info(f"New article tracked: {data['url']}")

def get_article_version_count(self, artice_id: str, article_source: str, hash: str):
def get_article(self, article_id):
return self.articles_table.find_one(id=article_id)

def get_article_version_count_ex(self, article_id: str, article_source: str):
return self.versions_table.count(
self.versions_table.table.columns.article_id == article_id,
article_source=article_source)

def get_article_version_count(self, article_id: str, article_source: str, hash: str):
return self.versions_table.count(
self.versions_table.table.columns.article_id == artice_id,
self.versions_table.table.columns.article_id == article_id,
article_source=article_source,
hash=hash)

Expand Down
2 changes: 1 addition & 1 deletion css/styles.css → image_generator/css/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
}

body {
background: lightgray url('../img/paper_fibers.png') repeat;
background: lightgray url('../base_image/paper_fibers.png') repeat;
font-family: Merriweather;
font-size: 16px;
direction: rtl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,12 @@ def generate_image_diff(old: str, new: str, text_to_tweet: str):
img = Image.open('./tmp.png')
img2 = img.crop((0, 0, total_width, total_height))
if int(total_width) > int(total_height * 2):
background = Image.new('RGBA', (total_width, int(total_width / 2)),
(255, 255, 255, 0))
background = Image.new('RGBA', (total_width, int(total_width / 2)), (255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
else:
background = Image.new('RGBA', (total_width, total_height),
(255, 255, 255, 0))
background = Image.new('RGBA', (total_width, total_height), (255, 255, 255, 0))
bg_w, bg_h = background.size
offset = (int((bg_w - total_width) / 2),
int((bg_h - total_height) / 2))
Expand Down
File renamed without changes.
Binary file removed img/paper_fibers.png
Binary file not shown.
Binary file removed img/twitter.png
Binary file not shown.
27 changes: 27 additions & 0 deletions loggers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
import os
import sys

# Optional output folder for the log file; empty string means the current
# working directory.
LOG_FOLDER = os.environ.get('LOG_FOLDER', '')

# Third-party loggers that are far too chatty at DEBUG level.
_NOISY_LOGGERS = (
    "requests", "selenium", "PIL", "requests_oauthlib",
    "chardet", "tweepy", "oauthlib", "urllib3",
)


def setup_loggers():
    """Configure root logging to both a file and stdout.

    Appends to 'titlediff.log' (UTF-8) under LOG_FOLDER, mirrors everything
    to stdout at DEBUG level, then caps noisy third-party loggers at WARNING.
    """
    logging_filehandler = logging.FileHandler(
        filename=os.path.join(LOG_FOLDER, 'titlediff.log'),
        encoding='utf-8', mode='a+')
    handlers = [logging_filehandler, logging.StreamHandler(sys.stdout)]
    logging.basicConfig(handlers=handlers,
                        format='%(asctime)s %(name)13s %(levelname)8s: %(message)s',
                        level=logging.DEBUG)

    # Silence noisy dependencies in one place instead of eight copies.
    for name in _NOISY_LOGGERS:
        logging.getLogger(name).setLevel(logging.WARNING)
46 changes: 10 additions & 36 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,29 @@
#!/usr/bin/python3

import logging
import os
import sys

from pytz import timezone

from parsers.haaretz_parser import HaaretzParser
from parsers.israel_hayom_parser import IsraelHayomParser
from parsers.walla_parser import WallaParser
from loggers import setup_loggers

# All article timestamps are recorded in Israel local time.
TIMEZONE = 'Israel'
LOCAL_TZ = timezone(TIMEZONE)

# Every parser instantiated on each run; add new news sources here.
PARSER_CLASSES = [HaaretzParser, IsraelHayomParser, WallaParser]


def main():
    """Run one polling pass: set up logging, then invoke every parser."""
    setup_loggers()
    logging.info('Starting script')

    try:
        logging.debug('Starting Parsers')
        parsers = [parser_class(LOCAL_TZ) for parser_class in PARSER_CLASSES]
        for parser in parsers:
            logging.info(f"Parsing {parser.get_source()}")
            parser.parse()
        logging.debug('Finished')
    except Exception:
        # One crashing parser must not abort the whole run (this script is
        # invoked repeatedly, e.g. from cron) — log the traceback and return.
        logging.exception('Parser')


if __name__ == "__main__":
Expand Down
Empty file removed output/.gitignore
Empty file.
6 changes: 3 additions & 3 deletions base_parser.py → parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from data_provider import DataProvider
from twitter_helper import upload_media, tweet_text, tweet_with_media
from image_diff_generator import ImageDiffGenerator
from image_generator.image_diff_generator import ImageDiffGenerator

if 'TESTING' in os.environ:
if os.environ['TESTING'] == 'False':
Expand Down Expand Up @@ -71,8 +71,7 @@ def tweet(self, text: str, article_id: str, url: str, image_path: str):

def store_data(self, data: Dict):
if self.data_provider.is_article_tracked(data['article_id'], self.get_source()):
count = self.data_provider.get_article_version_count(data[
'article_id'], self.get_source(), data['hash'])
count = self.data_provider.get_article_version_count(data['article_id'], self.get_source(), data['hash'])
if count != 1: # Changed
self.tweet_all_changes(data)
else:
Expand Down Expand Up @@ -121,6 +120,7 @@ def loop_entries(self, entries):
articles[article_dict['article_id']] = article_dict
except BaseException:
logging.exception(f'Problem looping entry: {article}')

for article_dict in articles.values():
try:
self.store_data(article_dict)
Expand Down
1 change: 1 addition & 0 deletions parsers/parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def standard_entry_to_dict(article, source, tz, strip_description=False):
else:
article_dict['abstract'] = article['description']
od = collections.OrderedDict(sorted(article_dict.items()))
# NOTE(review): suspected hash-key bug here — per review discussion, the fix belongs in the feature extractors, not in this function.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should fix the bug in the feature extractors and not here.
I have already fixed it in my branch

article_dict['hash'] = hashlib.sha224(repr(od.items()).encode('utf-8')).hexdigest()
article_dict['date_time'] = datetime.now(tz)
return article_dict
5 changes: 2 additions & 3 deletions rss_parser.py → parsers/rss_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging

import feedparser

from base_parser import BaseParser
Expand All @@ -16,6 +15,6 @@ def parse(self):
if r is None:
logging.warning('Empty response RSS')
return
else:
logging.info('Parsing %s', r.channel.title)

logging.info('Parsing %s', r.channel.title)
self.loop_entries(r.entries)
11 changes: 11 additions & 0 deletions process_data/csv_data_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os
import pandas as pd


class CsvDataProvider:
    """Loads the exported articles/versions CSV files from a data folder."""

    def __init__(self, data_files=r"../csvs", version=0):
        """Read articles.csv and versions_<version>.csv from ``data_files``."""
        self._data_files_dir = data_files
        articles_path = os.path.join(data_files, "articles.csv")
        self.articles = pd.read_csv(articles_path)
        # Kept public so callers can derive output file paths from it too.
        self.versions_file_format = os.path.join(data_files, "versions_{version}.csv")
        versions_path = self.versions_file_format.format(version=version)
        self.versions = pd.read_csv(versions_path)

12 changes: 12 additions & 0 deletions process_data/feature_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

class FeatureExtractor(object):
    """Base class for per-version feature extractors.

    Subclasses declare their output columns via get_cols() and compute the
    matching values in extract().
    """

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def get_cols() -> list:
        """Column names produced by this extractor; the base produces none."""
        return list()

    @staticmethod
    def extract(data, previous_data, article):
        """Compute features for one version given its history; base is a no-op."""
        return None
8 changes: 8 additions & 0 deletions process_data/printer_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from feature_extractor import FeatureExtractor


class PrinterExtractor(FeatureExtractor):
    """Debugging extractor: echoes each version row and yields no features."""

    @staticmethod
    def extract(data, previous_data, article):
        # Side effect only — print the row being processed.
        print(data)
        return []
77 changes: 77 additions & 0 deletions process_data/process_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import argparse
import itertools

import pandas as pd

from printer_extractor import PrinterExtractor
from word_token_extractor import WordsTokensExtractor
from csv_data_provider import CsvDataProvider

# Columns copied through when --clean is passed; all others are dropped.
COLUMNS_TO_KEEP = ["id", "version", "article_id", "article_source", "title", "amount_of_words"]

# Per-row derived columns: output column name -> function applied to each
# versions row (none currently enabled).
ROW_EXTRACTED = {
    # "amount_of_words": WordsTokensExtractor.how_many_words
}

# Extractors run over each version together with its history
# (none currently enabled).
FEATURE_EXTRACTORS = [
    # WordsTokensExtractor,
    # PrinterExtractor
]


def process_data(data_files, clean_csv, out_version, in_version=0):
    """Read a versions CSV, derive features, and write it back as a new version.

    Args:
        data_files: Folder containing articles.csv and versions_<n>.csv.
        clean_csv: If True, keep only COLUMNS_TO_KEEP from the input.
        out_version: Version number used to name the output file.
        in_version: Version number of the input file to read (default 0).
    """
    # TODO: choose which features to extract
    dt = CsvDataProvider(data_files, in_version)

    if clean_csv:
        # The data we want to keep from the csvs
        extracted_features = dt.versions[COLUMNS_TO_KEEP]
    else:
        extracted_features = dt.versions

    # cols based on the same row data -
    for col_name, f in ROW_EXTRACTED.items():
        extracted_features[col_name] = dt.versions.apply(f, axis=1, result_type="expand")

    if len(FEATURE_EXTRACTORS) > 0:
        # cols based on a lot of data -
        cols = list(itertools.chain.from_iterable([extractor.get_cols() for extractor in FEATURE_EXTRACTORS]))
        processed_data = pd.DataFrame(columns=cols)
        row_id = 0

        # Extract Features
        for _id, article in dt.articles.iterrows():
            # All stored versions of this one (article_id, source) pair.
            article_versions = dt.versions[(dt.versions["article_id"] == article["article_id"]) &
                                           (dt.versions["article_source"] == article["article_source"])]

            for __id, single_version in article_versions.iterrows():
                # Only versions preceding the current one count as history.
                past_versions = article_versions[article_versions["version"] < single_version["version"]]
                processed_row = []
                for feature_extractor in FEATURE_EXTRACTORS:
                    processed_row.extend(feature_extractor.extract(single_version, past_versions, article))

                processed_data.loc[row_id] = processed_row
                row_id += 1

    # NOTE(review): processed_data is built above but never merged into
    # extracted_features — presumably still TODO; confirm intent.
    # NOTE(review): the next line assumes an 'amount_of_words' column exists;
    # with ROW_EXTRACTED currently empty it will KeyError unless the input
    # CSV already contains that column — verify before running.
    # TODO: export back to csv
    print(f"Max words - {extracted_features['amount_of_words'].max()}")
    out_file = dt.versions_file_format.format(version=out_version)
    extracted_features.to_csv(out_file)

    print("Done!!")


def main():
    """Parse command-line arguments and run one data-processing pass."""
    arg_parser = argparse.ArgumentParser(description='Process some data.')
    arg_parser.add_argument('datafiles', help='A path to the folders with the csvs')
    arg_parser.add_argument('--in-version', default=0, help='Which version to use')
    arg_parser.add_argument('--out-version', help='Which version to write')
    arg_parser.add_argument('--clean', action="store_true", help='Should keep only important data')

    options = arg_parser.parse_args()
    process_data(options.datafiles, options.clean, options.out_version, options.in_version)


if __name__ == "__main__":
main()
Loading