News Application using Neurelo's Python SDK

This example pulls headlines from the RSS feeds listed in feeds.json, scrapes each article's full text with Scrapy and newspaper, and stores the results in a Neurelo-backed database through the project's generated NewsApi client.

feeds.json

{
  "trending": [
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://www.cbc.ca/webfeed/rss/rss-topstories"
  ]
}
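
Each top-level key in feeds.json is a topic, stored later as the article's category, and its value is a list of RSS feed URLs for that topic. To track more topics, add entries of the same shape; the extra topic below is only an illustration:

{
  "trending": [
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://www.cbc.ca/webfeed/rss/rss-topstories"
  ],
  "technology": [
    "https://feeds.bbci.co.uk/news/technology/rss.xml"
  ]
}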

fetch.py

import feedparser
import scrapy
from scrapy.crawler import CrawlerProcess
import json
from newspaper import fulltext
from dotenv import load_dotenv
import os

from neurelo.configuration import Configuration
from neurelo.api_client import ApiClient
from neurelo.api.news_api import NewsApi

# Pages collected by the spider, keyed by the URL taken from the RSS entry.
scraped = {}

def start_scrapy(urls):
    # Run the spider in-process; results accumulate in the `scraped` dict.
    process = CrawlerProcess(settings={"LOG_ENABLED": False})
    process.crawl(Scrape, urls=urls)
    process.start()

class Scrape(scrapy.Spider):
    name = "scrape"

    def __init__(self, urls=None, **kwargs):
        super().__init__(**kwargs)
        self.urls = urls or []

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Key the page by the original (pre-redirect) URL so it matches
        # the link recorded from the RSS entry.
        redirects = response.request.meta.get("redirect_urls")
        key = redirects[0] if redirects else response.url
        scraped[key] = response.text

class Parse:
    def fetch_feeds(self):
        # Load the topic -> feed-URL mapping from feeds.json.
        with open("./feeds.json") as f:
            feeds = json.load(f)
        return (feeds, list(feeds.keys()))

    def parse_rss_feeds(self, feeds, topics, limit=5):
        content = {}
        need_to_scrape = []
        index = 0
        for topic in topics:
            for feed in feeds[topic]:
                fetched = feedparser.parse(feed)
                for news in fetched.entries[:limit]:
                    content[index] = {
                        "title": news["title"],
                        "link": news["link"],
                        # Not every feed sets `published`, so default to "".
                        "published": news.get("published", ""),
                        "category": topic,
                    }
                    need_to_scrape.append(news["link"])
                    index += 1
        return (content, need_to_scrape)

    def get_news(self, rss_content):
        for index in rss_content:
            feed_url = rss_content[index]["link"]
            html = scraped.get(feed_url)
            # Leave the summary empty for pages the crawl failed to fetch.
            rss_content[index]["summary"] = (
                fulltext(html, language="en") if html else ""
            )
        return rss_content

    def parse(self):
        rss_feeds, topics = self.fetch_feeds()
        content, need_to_scrape = self.parse_rss_feeds(rss_feeds, topics)
        start_scrapy(need_to_scrape)
        return self.get_news(content)

class News:
    def __init__(self) -> None:
        host, key = self.env()
        self.api_client = self.conf(host, key)

    def env(self):
        load_dotenv()
        host = os.getenv("NEURELO_API_HOST")
        key = os.getenv("NEURELO_API_KEY")
        if not host or not key:  # fail fast on missing credentials
            raise RuntimeError("Set NEURELO_API_HOST and NEURELO_API_KEY in .env")
        return (host, key)

    def conf(self, host, key):
        # The generated SDK authenticates with the project's API key.
        configuration = Configuration(host=host, api_key={"ApiKey": key})
        return ApiClient(configuration=configuration)

    def fetch(self):
        news_api = NewsApi(self.api_client)
        return news_api.find_news()

    def store(self):
        # Scrape the configured feeds and bulk-insert the articles.
        parse = Parse()
        news = parse.parse()
        articles = list(news.values())

        news_api = NewsApi(self.api_client)
        return news_api.create_many_news(articles)

    def delete(self):
        news_api = NewsApi(self.api_client)
        deleted = news_api.delete_news()
        return deleted

if __name__ == "__main__":
    news = News()
    print(news.fetch())
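
The News class reads its connection details from a .env file next to the script. A minimal sketch, with placeholders standing in for your Neurelo environment's API host and key:

NEURELO_API_HOST=<your-neurelo-api-host>
NEURELO_API_KEY=<your-neurelo-api-key>

With that in place, a short driver script shows the intended flow: store scrapes the feeds and bulk-inserts the articles, and fetch reads them back (exact response shapes come from the generated SDK):

from fetch import News

news = News()
print(news.store())  # scrape the RSS feeds and persist the articles
print(news.fetch())  # query the stored articles back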
