Function isn't running when deployed on Apify | Apify & Crawlee | Page 1

rich hound Oct 9, 2023, 7:44 PM

#

Hello everyone,

The following code works perfectly when I run it without Apify; however, once I run it on Apify, the second function (`second_page`) is never called. I am using the Apify template for Scrapy.

Thanks for the help

from typing import Generator
from scrapy.responsetypes import Response
from apify import Actor
from urllib.parse import urljoin
import nest_asyncio
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from scrapy.http import Response, Request

class TitleSpider(scrapy.Spider):
    """Crawl the apc.fr men's shirts listing and visit every product page.

    ``parse`` handles the listing page from ``start_urls`` and schedules one
    request per product link; ``second_page`` handles each product page.
    """

    name = 'example'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/men/men-shirts.html",
    ]

    def parse(self, response: Response):
        """Yield a request for every product linked from the listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``scrapy.Request`` per product link found, with
            ``second_page`` as its callback.
        """
        Actor.log.info(f'TitleSpider is parsing {response}...')
        for li in response.css('li.product-item'):
            # Only the first href inside the product-link container is used.
            product_link = li.css('.product-link').css('a::attr(href)').get()
            if not product_link:
                continue
            # Resolve against the page URL: scrapy.Request raises ValueError
            # for URLs without a scheme, so a relative href would abort the
            # whole callback. Absolute hrefs pass through unchanged.
            yield scrapy.Request(
                url=response.urljoin(product_link),
                callback=self.second_page,
            )

    def second_page(self, response: Response):
        """Log the product page and print its product name.

        Args:
            response: The downloaded product page.
        """
        Actor.log.info(f'Second fonction is parsing {response}...')
        # .get() returns None when the selector matches nothing, so this may
        # print "None" for pages without an h1.product-name element.
        productname = response.css('h1.product-name::text').get()
        print(productname)

buoyant sleet Oct 13, 2023, 1:31 PM

#

Hey, thanks for the report! The team knows about the issue and will fix it

rich hound Oct 15, 2023, 5:03 PM

#

Thanks for your answer. Any ETA for this resolution? Thanks

rich hound Oct 16, 2023, 8:15 AM

#

In the meantime, in case it helps anyone, I found this workaround that somehow works.

class TitleSpider(scrapy.Spider):
    name = 'title_spider'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/women/women-blouses-shirts.html",
    ]
    test_url = None

    def parse(self, response: Response):
        """Handle both listing pages and product pages in a single callback.

        Product requests are created without an explicit callback, so they
        come back to ``parse``; the ``isProductPage`` meta flag is what
        distinguishes them from the listing page.

        Args:
            response: The downloaded page (listing or product).

        Yields:
            For a product page, whatever ``_parse_prod`` yields. For a
            listing page, one ``scrapy.Request`` per product link, flagged
            with ``isProductPage`` in its meta.
        """
        if 'isProductPage' in response.meta:
            yield from self._parse_prod(response)
            return

        Actor.log.info(f'Main page => {response}...')
        for li in response.css('li.product-item'):
            for link in li.css('.product-link').css('a::attr(href)').getall():
                # _parse_prod reads this attribute to know which listing
                # page the product came from.
                self.test_url = response.url
                # Resolve against the page URL: scrapy.Request raises
                # ValueError for URLs without a scheme, so a relative href
                # would abort the crawl of this listing page.
                yield scrapy.Request(
                    response.urljoin(link),
                    meta={"isProductPage": True},
                )

    def _parse_prod(self, response: Response):
        Actor.log.info(f'Product Page => {response}...')
        test_url = self.test_url      
        current_url = response.url
        productname = response.css('h1.product-name::text').get()
        productdescriptionfirst = response.css('div.product.attribute.intro')
        productdescriptionsecond = productdescriptionfirst.css('div.value')
        productdescription = 
        }'
#Other stuffs ....

#Function isn't running when deployed on Apify