feat: idealo scraper

This commit is contained in:
2026-05-25 14:05:16 +00:00
parent ed7c7c7bbf
commit f86055b85d
3 changed files with 2718 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
import * as cheerio from 'cheerio'
import type { PriceScraper, ScrapeResult } from './types'
// Desktop Chrome (Linux) user-agent string, sent as the `User-Agent` header on every scrape request.
// NOTE(review): presumably chosen so the request looks like a regular browser — confirm before changing.
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
/**
 * Price scraper for idealo product pages.
 *
 * `scrape` downloads the page (20 s timeout), collects price candidates from
 * several DOM selectors, the microdata `price` meta tag and JSON-LD, and
 * returns the first candidate that parses to a positive number.
 *
 * Failures are reported through the `error` field of the result instead of
 * being thrown:
 * - `HTTP <status>` when the response is not OK,
 * - `price-selector-missed` when no candidate yields a price,
 * - the underlying error message for network/timeout/parse exceptions.
 */
export const idealoScraper: PriceScraper = {
  shop: 'idealo',

  /**
   * @param url Product page URL to fetch.
   * @returns The parsed price in EUR together with name and image URL when found;
   *          `price: null` plus an `error` otherwise.
   */
  async scrape(url: string): Promise<ScrapeResult> {
    try {
      const res = await fetch(url, {
        headers: {
          'User-Agent': UA,
          'Accept': 'text/html,application/xhtml+xml',
          'Accept-Language': 'de-DE,de;q=0.9',
        },
        signal: AbortSignal.timeout(20_000),
      })
      if (!res.ok) {
        return { price: null, currency: 'EUR', availability: 'unknown', error: `HTTP ${res.status}` }
      }

      const $ = cheerio.load(await res.text())

      // Candidate price sources, in priority order.
      const candidates = [
        $('[data-testid="detail-offer-price"]').first().text(),
        $('meta[itemprop="price"]').attr('content'),
        $('span.oopStage-conditionButton-price').first().text(),
        $('strong.oopStage-price').first().text(),
        $('span.oopStage-priceRangePrice').first().text(),
        $('.oop-productOfferOutbox-priceAmount').first().text(),
        extractJsonLdPrice($),
      ]

      // Walk the candidates until one actually parses. Looking only at the first
      // non-empty candidate would let a whitespace-only or non-numeric match
      // hide a valid price offered by a later source.
      let price: number | null = null
      for (const candidate of candidates) {
        if (!candidate) continue
        price = parsePrice(candidate)
        if (price !== null) break
      }

      const name = (
        $('h1#oopStage-title span').first().text()
        || $('h1[data-testid="product-title"]').text()
        || $('h1').first().text()
        || ''
      ).trim()
      const imageUrl = $('meta[property="og:image"]').attr('content') || undefined

      if (price === null) {
        return { price: null, currency: 'EUR', availability: 'unknown', name, imageUrl, error: 'price-selector-missed' }
      }
      // A parsed price is reported as in stock; the page is not inspected for stock status.
      return { price, currency: 'EUR', availability: 'in_stock', name, imageUrl }
    } catch (err) {
      // Anything can be thrown in JS; narrow before reading `.message`.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    }
  },
}
/**
 * Looks for a price inside the page's `application/ld+json` script blocks.
 *
 * Each block is parsed as JSON; malformed blocks are skipped. The parsed
 * document may be a single node, an array of nodes, or a node carrying a
 * `@graph` array. For every node, `offers` may be a single object or an
 * array. The first usable value among `price`, `lowPrice`, `highPrice`
 * (in that order) is returned.
 *
 * @param $ Cheerio handle for the loaded page.
 * @returns The price as a string (positive numbers are stringified, non-empty
 *          strings are returned unchanged), or `null` when nothing is found.
 */
function extractJsonLdPrice($: cheerio.CheerioAPI): string | null {
  const scripts = $('script[type="application/ld+json"]')
  for (let i = 0; i < scripts.length; i++) {
    const raw = $(scripts[i]).contents().text()
    if (!raw) continue

    let data: unknown
    try {
      data = JSON.parse(raw)
    } catch {
      // Best effort: a malformed JSON-LD block must not prevent trying the next one.
      continue
    }

    for (const node of jsonLdNodes(data)) {
      for (const offer of jsonLdRecords(node.offers)) {
        const price = jsonLdOfferPrice(offer)
        if (price !== null) return price
      }
    }
  }
  return null
}

type JsonLdRecord = Record<string, unknown>

/** Type guard: true for plain (non-null, non-array) objects. */
function isJsonLdRecord(value: unknown): value is JsonLdRecord {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** Normalizes "one object or an array of objects" to an array, dropping non-object entries. */
function jsonLdRecords(value: unknown): JsonLdRecord[] {
  const items: unknown[] = Array.isArray(value) ? value : [value]
  return items.filter(isJsonLdRecord)
}

/** Lists the top-level nodes of a parsed JSON-LD document plus the members of any `@graph` array. */
function jsonLdNodes(data: unknown): JsonLdRecord[] {
  return jsonLdRecords(data).flatMap((node) => [node, ...jsonLdRecords(node['@graph'])])
}

/** Returns the first usable price-like field of an offer, or `null` if none is usable. */
function jsonLdOfferPrice(offer: JsonLdRecord): string | null {
  // Check each field independently so that an unusable `price` (0 or '')
  // does not block the fallback to `lowPrice` / `highPrice`.
  for (const key of ['price', 'lowPrice', 'highPrice']) {
    const value = offer[key]
    if (typeof value === 'number' && value > 0) return String(value)
    if (typeof value === 'string' && value) return value
  }
  return null
}
/**
 * Parses a price out of display text that uses German number formatting
 * (`.` or whitespace as thousands separator, `,` as decimal separator).
 * Plain dot-decimal values such as `149.99` are accepted as well.
 *
 * Only the FIRST numeric token in the text is considered. Stripping every
 * non-digit from the whole string would glue separate amounts together
 * (e.g. "149,99 € + 4,99 € Versand" would become 149.994).
 *
 * @param text Raw text that may contain currency symbols or surrounding words.
 * @returns The price as a positive finite number, or `null` if none can be parsed.
 */
function parsePrice(text: string): number | null {
  // Alternatives, tried in order at the leftmost digit/separator:
  //  1. digit groups separated by a (non-breaking/thin) space: "1 299,00"
  //  2. digits with dot/comma separators, leading digits optional: "1.299,00", "149.99", ",99"
  //  3. a bare integer: "149"
  const token = text.match(/\d{1,3}(?:[ \u00a0\u2009\u202f]\d{3})+(?:[.,]\d+)?|\d*(?:[.,]\d+)+|\d+/)?.[0]
  if (!token) return null

  const normalized = token
    .replace(/[^\d.,]/g, '') // drop whitespace thousands separators
    .replace(/\.(?=\d{3}(\D|$))/g, '') // a dot followed by exactly three digits is a thousands separator
    .replace(',', '.') // decimal comma -> decimal point

  const n = parseFloat(normalized)
  return Number.isFinite(n) && n > 0 ? n : null
}

2611
tests/fixtures/idealo-headphones.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,30 @@
import { describe, it, expect, vi, beforeEach } from 'vitest'
import { readFileSync } from 'node:fs'
import { join } from 'node:path'
import { idealoScraper } from '@/lib/scrapers/idealo'
// Saved idealo product page; the mocked fetch serves it instead of a live request.
const fixture = readFileSync(join(__dirname, '../fixtures/idealo-headphones.html'), 'utf-8')

// Default stub: before every test, fetch resolves to a 200 response whose body is the fixture HTML.
beforeEach(() => {
  const okResponse = {
    ok: true,
    status: 200,
    text: async () => fixture,
  }
  global.fetch = vi.fn().mockResolvedValue(okResponse) as unknown as typeof fetch
})
// Exercises idealoScraper.scrape against a mocked fetch: once with the default
// fixture response and once with a 403 response.
describe('idealoScraper', () => {
  const productUrl = 'https://www.idealo.de/foo'

  it('extracts price and name', async () => {
    // Uses the fetch stub installed by the surrounding beforeEach.
    const result = await idealoScraper.scrape(productUrl)

    expect(result.price).toBeGreaterThan(0)
    expect(result.currency).toBe('EUR')
    expect(result.name).toBeTruthy()
  })

  it('flags cloudflare challenge', async () => {
    // Override the default stub with a non-OK response for this test only.
    const blockedResponse = {
      ok: false,
      status: 403,
      text: async () => '<html>Cloudflare</html>',
    }
    global.fetch = vi.fn().mockResolvedValue(blockedResponse) as unknown as typeof fetch

    const result = await idealoScraper.scrape(productUrl)

    expect(result.price).toBeNull()
    expect(result.error).toMatch(/403|cloudflare/i)
  })
})