init
full_analysis.py (new file, 97 lines)
@@ -0,0 +1,97 @@
from app import db, app
from app.models.site import Site
from app.models.article import Article

import feedparser
import trafilatura
import httpx
from pprint import pprint

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
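
# Pipeline overview: load every Site from the database, pull each site's
# RSS/Atom feed, fetch the linked articles, strip page boilerplate with
# trafilatura, summarize the cleaned text with sumy's LSA summarizer, and
# persist the results as Article rows.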

def get_all_sites() -> list[Site]:
    # Site.query needs an active Flask application context.
    with app.app_context():
        all_sites = Site.query.all()
        return all_sites

def print_sites(sites: list[Site]) -> None:
    for site in sites:
        print(site.name)
        print(site.base_url)
        print(site.feed_url)
        print("*" * 10)

def analyze_site(site: Site) -> list[dict]:
    feed = feedparser.parse(site.feed_url)
    links = get_article_links_from_feed(feed)
    return links

def get_article_links_from_feed(feed: feedparser.util.FeedParserDict) -> list[dict]:
    links: list[dict] = []
    for entry in feed["entries"]:
        links.append({
            "title": entry["title"],
            "link": entry["link"],
        })
    return links

def analyze_sites(sites: list[Site]) -> list[list[dict]]:
    results = []
    for site in sites:
        articles = analyze_site(site)
        import_articles(articles, site)
        results.append(articles)
    return results

def import_articles(articles: list[dict], site: Site) -> bool:
    success = True
    for article in articles:
        parsed_article = parse_article(article, site)
        new_article = Article(
            title=parsed_article["title"],
            url=parsed_article["url"],
            raw_content=parsed_article["raw_content"],
            summarized_content=parsed_article["summarized_content"],
            debloated_content=parsed_article["debloated_content"],
            site=site,
        )
        with app.app_context():
            try:
                db.session.add(new_article)
                db.session.commit()
                print(f"[+] Written {new_article.title}")
            except Exception as e:
                # Roll back the failed transaction and report the error
                # instead of silently claiming success.
                db.session.rollback()
                success = False
                print(f"[-] Failed to write {new_article.title}: {e}")
    return success
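
# Note: sumy's english Tokenizer uses NLTK sentence tokenization under the
# hood, so the "punkt" tokenizer data must be available on first run
# (e.g. nltk.download("punkt")).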
def summarize_article(text: str, language: str = "english", sentences_count: int = 5) -> str:
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return " ".join(str(sentence) for sentence in summary)

def parse_article(article: dict, site: Site) -> dict:
    # Feed entries often point at redirecting URLs, so follow redirects
    # explicitly (httpx does not follow them by default).
    resp = httpx.get(article["link"], follow_redirects=True)
    article_raw = resp.text
    # trafilatura.extract() returns None when no main content is found;
    # fall back to empty strings so a single bad page doesn't crash the run.
    article_debloated = trafilatura.extract(article_raw) or ""
    article_summary = summarize_article(article_debloated) if article_debloated else ""
    return {
        "title": article["title"],
        "url": article["link"],
        "raw_content": article_raw,
        "debloated_content": article_debloated,
        "summarized_content": article_summary,
    }
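
# Usage note: running this module directly (python full_analysis.py) kicks off
# the full pipeline below; it assumes the `app` package is importable from the
# working directory.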
def main():
    sites = get_all_sites()
    results = analyze_sites(sites)
    pprint(results)


if __name__ == "__main__":
    main()