#News Application Scraping and Saving Data using Celery Beat [Code Review]
14 messages · Page 1 of 1 (latest)
@shared_task()
def tc_get_news():
    """Scrape the TechCrunch "latest" list and store new articles.

    Takes the first 20 title links of the list page. For each one that is
    not already stored (matched by title), fetches the detail page via
    ``tc_get_detailed``, generates a summary via ``make_summary``, assigns
    a domain via ``tc_get_category`` and saves an ``Article``.

    Cards whose markup does not match the expected structure are skipped
    instead of aborting the whole run.
    """
    from .models import Article, Domain

    url = "https://techcrunch.com/latest/"
    # A timeout keeps a stalled connection from blocking the worker forever.
    page = requests.get(url, timeout=10).text
    soup = bs4.BeautifulSoup(page, "html.parser")
    links_tag = soup.find_all(class_="loop-card__title-link")[:20]
    for link in links_tag:
        title = link.text
        article_url = link.attrs.get("href")
        card = link.find_parent("li")
        if not article_url or card is None:
            # Unexpected card markup: nothing reliable to scrape from it.
            continue

        # Check for duplicates BEFORE fetching the detail page and calling
        # the summary API, so known articles cost no extra requests.
        if Article.objects.filter(title=title).exists():
            continue

        class_list = card.get("class", [])
        cat_group = link.parent.parent.find(class_="loop-card__cat-group")
        # The article type is "breif" when the category group holds a <span>.
        breif = cat_group is not None and cat_group.find("span") is not None

        image_url, pub, short, content, author = tc_get_detailed(
            article_url, breif=breif
        )
        # If the detailed page has a different structure we get no content
        # (or no publication date), so skip that article.
        if not content or not pub:
            continue
        published = datetime.fromisoformat(pub)
        print(title, short, content, image_url, published, article_url, author)
        summary = make_summary(content)
        article = Article(
            title=title,
            short_description=short,
            content=content,
            image_url=image_url,
            post_published=published,
            source_link=article_url,
            author=author,
            summary=summary,
        )
        tc_get_category(class_list, article, Domain)
        article.save()
def tc_get_detailed(url, breif, timeout=10):
    """Scrape a single article detail page.

    Args:
        url: Address of the article detail page.
        breif: When true, the short description is the first 350 characters
            of the content and the author is read from the
            ``post-authors-list__author`` element; otherwise the first
            paragraph is the short description and the author is read from
            the ``wp-block-tc23-author-card-name__link`` element.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A tuple ``(image_url, pub_date, short_des, content, author)``.
        When the page does not follow the expected structure, ``content``
        (and any other field that could not be read) is ``None``.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    page = requests.get(url, timeout=timeout).text
    soup = bs4.BeautifulSoup(page, "html.parser")
    # Every returned field starts as None so the "different pattern" path
    # can return safely (author used to be unbound there).
    image_url = pub_date = short_des = content = author = None
    paras = None
    try:
        image_url = soup.find(class_="wp-post-image").attrs["src"]
        pub_date = soup.find(class_="wp-block-post-date").find("time").attrs["datetime"]
        paras = soup.find_all(class_="wp-block-paragraph")
    except (AttributeError, KeyError) as e:
        # AttributeError: an element was not found (find() returned None).
        # KeyError: the element exists but lacks the expected attribute.
        print("Different Pattern", e)

    if not paras:
        return image_url, pub_date, short_des, content, author

    # The first paragraph is kept out of the body text.
    content = "".join(para.text for para in paras[1:])
    if not breif:
        short_des = paras[0].text
        author_class = "wp-block-tc23-author-card-name__link"
    else:
        short_des = content[:350]
        author_class = "post-authors-list__author"

    author_tag = soup.find(class_=author_class)
    if author_tag is not None:
        author = author_tag.text
    return image_url, pub_date, short_des, content, author
def tc_get_category(class_list, article, Tag):
    """Assign ``article.domain`` from ``category-<slug>`` CSS classes.

    Args:
        class_list: Iterable of CSS class names.
        article: Object whose ``domain`` attribute is set; it is not saved
            here.
        Tag: Model class exposing ``objects.get_or_create(category=...)``.

    For every class of the form ``category-<slug>`` a tag is fetched or
    created with ``category=<slug>``. If several classes match, the last
    one ends up on the article. When none match, ``article.domain`` is left
    untouched.
    """
    prefix = "category-"
    for cls in class_list:
        if not cls.startswith(prefix):
            continue
        # Strip only the leading prefix; str.replace would also remove any
        # later "category-" occurrences inside the slug.
        category = cls[len(prefix):]
        if not category:
            # A bare "category-" carries no slug; do not create a blank tag.
            continue
        tag, _created = Tag.objects.get_or_create(category=category)
        article.domain = tag
def make_summary(content, delay=10, timeout=30):
    """Ask the Gemini API for a summary of ``content``.

    Args:
        content: Article text to summarise.
        delay: Seconds to sleep before the request (free-tier rate limit).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The summary text, or ``None`` when the request fails or the
        response does not have the expected shape.
    """
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    api_key = os.getenv("gem_api_key", "")
    # The key goes in a header rather than the query string: requests puts
    # the full URL into its exception messages, which are printed below.
    headers = {"x-goog-api-key": api_key}
    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "text": "Provide a summary for the following Article,Give the Summary directly"
                    },
                    {"text": content},
                ]
            }
        ]
    }
    time.sleep(delay)  # gemini rate limit on free tier
    try:
        # json= serialises the payload and sets the JSON Content-Type header.
        response = requests.post(
            url, headers=headers, json=payload, timeout=timeout
        )
        # Surface HTTP errors (e.g. rate limiting) as such, instead of as a
        # KeyError on the missing "candidates" key.
        response.raise_for_status()
        return response.json()["candidates"][0]["content"]["parts"][0]["text"]
    except (
        requests.RequestException,
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as e:
        print("An error occured", e)
        return None
There's not much to review my friend from django's perspective. If it works, then great job, that's all you should care about.
The only thing I could comment on is possibly dropping the save() in favor of a One-shot bulk_create.
(unless you have overridden save(...) to do something important)
OK, got it. This is my first time doing something like this, and I was afraid I was doing something wrong 😅
Using something like requests with Celery is OK, I guess? I don't need something like an async request library?
If you have lots of requests to "fetch", then yes, async requests will be massively more performant. Judging by what I'm seeing, you only issue 1 request (?) to TechCrunch; the rest of the work is spent making API calls to generativelanguage.googleapis.com
I would approach it differently.
I'd use a beat to do the scheduled extractions and then push X number of tasks (1 task per link) to another queue that will be responsible to create the article in the DB AND issue the requests to Gemini.
Why? That way, I can have N amount of workers working on M number of tasks in parallel. You'll finish in no time.
P.S. Careful with the rate limits.
Oh understood 👍
I am making requests to the detail pages of the TechCrunch articles too
You're right, I missed your sub-requests.
# [1] Non-Async (Current)
Main Task
> GET ONE
> LOOP
> GET ONE
> GENERATE ONE
If you did not have "GENERATE ONE", you could benefit from straight up async requests.
# [2] Async
Main Task
> GET ONE
> ASYNC LOOP
> ASYNC GET ONE
> ASYNC GENERATE ONE (POST) (Check for streamed responses, I don't know how it works)
# [3] Parallel (Simplest solution for performant result)
Main Task
> GET ONE
> LOOP
> SCHEDULE ONE
Processing Task
> GET ONE
> GENERATE ONE
My recommendation is #3 (what i said before)
Thanks, makes sense
Oh, you should use acks_late=False because your task is not idempotent. With acks_late=True, Celery will re-run your "semi-executed" task in the event of a worker failure, and that re-run can store duplicate values in the database.
Also use Celery signals such as task_success, task_prerun, task_postrun, etc. to update the task status. The signal handlers should update the progress in the database.