feat: idealo scraper
This commit is contained in:
77
src/lib/scrapers/idealo.ts
Normal file
77
src/lib/scrapers/idealo.ts
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import * as cheerio from 'cheerio'
|
||||||
|
import type { PriceScraper, ScrapeResult } from './types'
|
||||||
|
|
||||||
|
// User-Agent header value sent with the scraper's HTTP request. The string
// identifies the client as desktop Chrome 120 on Linux.
// NOTE(review): presumably chosen so the site serves its regular HTML page
// rather than a bot/blocked variant — confirm.
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
|
||||||
|
|
||||||
|
/**
 * Price scraper for idealo product pages.
 *
 * Downloads the page HTML, then looks for the price in a list of known
 * selectors (with a JSON-LD fallback), and for the product name and image.
 */
export const idealoScraper: PriceScraper = {
  shop: 'idealo',

  /**
   * Fetches `url` and extracts price, product name and image URL.
   *
   * This method never rejects: HTTP error statuses, network failures and the
   * 20 second timeout are all reported through the `error` field of the
   * returned result with `price: null`.
   *
   * @param url Product page URL to scrape.
   * @returns The scrape result. `currency` is always `'EUR'`. `availability`
   *   is `'in_stock'` whenever a price was found (it is not read from the
   *   page) and `'unknown'` otherwise.
   */
  async scrape(url: string): Promise<ScrapeResult> {
    try {
      const res = await fetch(url, {
        headers: {
          'User-Agent': UA,
          'Accept': 'text/html,application/xhtml+xml',
          'Accept-Language': 'de-DE,de;q=0.9',
        },
        signal: AbortSignal.timeout(20_000),
      })
      if (!res.ok) {
        return { price: null, currency: 'EUR', availability: 'unknown', error: `HTTP ${res.status}` }
      }
      const $ = cheerio.load(await res.text())

      // Candidate price sources, in priority order.
      const priceCandidates: Array<string | null | undefined> = [
        $('[data-testid="detail-offer-price"]').first().text(),
        $('meta[itemprop="price"]').attr('content'),
        $('span.oopStage-conditionButton-price').first().text(),
        $('strong.oopStage-price').first().text(),
        $('span.oopStage-priceRangePrice').first().text(),
        $('.oop-productOfferOutbox-priceAmount').first().text(),
        extractJsonLdPrice($),
      ]

      // Use the first candidate that actually parses to a price. A matched
      // element whose text is only whitespace or a label is truthy, so simply
      // taking the first non-empty string would hide valid later candidates.
      let price: number | null = null
      for (const candidate of priceCandidates) {
        if (!candidate) continue
        price = parsePrice(candidate)
        if (price !== null) break
      }

      // Same idea for the title: trim first, then take the first non-blank one.
      const name =
        [
          $('h1#oopStage-title span').first().text(),
          $('h1[data-testid="product-title"]').text(),
          $('h1').first().text(),
        ]
          .map((title) => title.trim())
          .find((title) => title !== '') ?? ''

      const imageUrl = $('meta[property="og:image"]').attr('content') || undefined

      if (price === null) {
        return { price: null, currency: 'EUR', availability: 'unknown', name, imageUrl, error: 'price-selector-missed' }
      }
      return { price, currency: 'EUR', availability: 'in_stock', name, imageUrl }
    } catch (err: unknown) {
      // Anything can be thrown in JS; only Error instances carry `.message`.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    }
  },
}
|
||||||
|
|
||||||
|
/**
 * Searches the page's `<script type="application/ld+json">` blocks for an
 * offer price.
 *
 * Scripts are inspected in document order and the first usable price wins.
 * Within each script the following shapes are understood:
 *  - a single top-level object, or a top-level array of objects;
 *  - objects nested in a top-level `@graph` array;
 *  - `offers` given as one object or as an array of objects.
 *
 * For each offer, `price` is preferred, then `lowPrice`, then `highPrice`.
 * Numeric prices must be greater than zero; string prices must be non-empty
 * and are returned unparsed.
 *
 * @param $ Cheerio handle for the loaded document.
 * @returns The price as a string, or `null` if no script yields one.
 */
function extractJsonLdPrice($: cheerio.CheerioAPI): string | null {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value)

  // Normalises "one item or an array of items" to an array.
  const toList = (value: unknown): unknown[] => {
    if (Array.isArray(value)) return value
    return value == null ? [] : [value]
  }

  const priceOf = (offer: unknown): string | null => {
    if (!isRecord(offer)) return null
    const price = offer.price ?? offer.lowPrice ?? offer.highPrice
    if (typeof price === 'number' && price > 0) return String(price)
    if (typeof price === 'string' && price) return price
    return null
  }

  const scripts = $('script[type="application/ld+json"]')
  for (let i = 0; i < scripts.length; i++) {
    const raw = $(scripts[i]).contents().text()
    if (!raw) continue

    let data: unknown
    try {
      data = JSON.parse(raw)
    } catch {
      // Malformed JSON-LD is ignored on purpose; other scripts may still be valid.
      continue
    }

    // Collect every node that might carry `offers`: the top-level item(s)
    // plus anything listed under their `@graph`.
    const nodes: unknown[] = []
    for (const top of toList(data)) {
      nodes.push(top)
      if (isRecord(top)) nodes.push(...toList(top['@graph']))
    }

    for (const node of nodes) {
      if (!isRecord(node)) continue
      for (const offer of toList(node.offers)) {
        const price = priceOf(offer)
        if (price !== null) return price
      }
    }
  }
  return null
}
|
||||||
|
|
||||||
|
/**
 * Parses a price out of free-form text such as "1.299,99 €" or "199.99".
 *
 * Everything except digits, dots and commas is discarded first. Then:
 *  - An unambiguous English-style number (comma-grouped thousands followed by
 *    a dot decimal, e.g. "1,299.99") has its commas dropped. Without this
 *    case the value would be read as 1.299, off by a factor of 1000.
 *  - Otherwise the text is read German-style: a dot followed by exactly three
 *    digits is a thousands separator and is removed, and the first comma
 *    becomes the decimal point.
 *
 * @param text Raw text that may contain a price.
 * @returns The price as a positive finite number, or `null` when the text
 *   holds no number or the number is not greater than zero.
 */
function parsePrice(text: string): number | null {
  const cleaned = text.replace(/[^\d.,]/g, '')
  if (!cleaned) return null

  // Deliberately strict so that only unambiguous input takes the English path.
  const isEnglishGrouped = /^\d{1,3}(?:,\d{3})+\.\d+$/.test(cleaned)
  const normalized = isEnglishGrouped
    ? cleaned.replace(/,/g, '')
    : cleaned.replace(/\.(?=\d{3}(\D|$))/g, '').replace(',', '.')

  const n = parseFloat(normalized)
  return Number.isFinite(n) && n > 0 ? n : null
}
|
||||||
2611
tests/fixtures/idealo-headphones.html
vendored
Normal file
2611
tests/fixtures/idealo-headphones.html
vendored
Normal file
File diff suppressed because one or more lines are too long
30
tests/scrapers/idealo.test.ts
Normal file
30
tests/scrapers/idealo.test.ts
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import { describe, it, expect, vi, beforeEach } from 'vitest'
|
||||||
|
import { readFileSync } from 'node:fs'
|
||||||
|
import { join } from 'node:path'
|
||||||
|
import { idealoScraper } from '@/lib/scrapers/idealo'
|
||||||
|
|
||||||
|
// Saved idealo product page used as the response body for the success case.
const fixture = readFileSync(join(__dirname, '../fixtures/idealo-headphones.html'), 'utf-8')

/**
 * Replaces the global `fetch` with a mock that resolves to a minimal
 * response-like object exposing `ok`, `status` and `text()`.
 */
function stubFetch(ok: boolean, status: number, body: string): void {
  global.fetch = vi.fn().mockResolvedValue({
    ok,
    status,
    text: async () => body,
  }) as unknown as typeof fetch
}

// Default for every test: a successful response serving the fixture page.
beforeEach(() => {
  stubFetch(true, 200, fixture)
})

describe('idealoScraper', () => {
  it('extracts price and name', async () => {
    const result = await idealoScraper.scrape('https://www.idealo.de/foo')

    expect(result.price).toBeGreaterThan(0)
    expect(result.currency).toBe('EUR')
    expect(result.name).toBeTruthy()
  })

  it('flags cloudflare challenge', async () => {
    // Override the default mock with a blocked (HTTP 403) response.
    stubFetch(false, 403, '<html>Cloudflare</html>')

    const result = await idealoScraper.scrape('https://www.idealo.de/foo')

    expect(result.price).toBeNull()
    expect(result.error).toMatch(/403|cloudflare/i)
  })
})
|
||||||
Reference in New Issue
Block a user