feat: geizhals scraper with cheerio + tests

This commit is contained in:
2026-05-25 14:01:12 +00:00
parent 890fdecf24
commit ed7c7c7bbf
3 changed files with 3837 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
import * as cheerio from 'cheerio'
import type { PriceScraper, ScrapeResult } from './types'
// Desktop Chrome-on-Linux browser identification string; sent as the
// `User-Agent` header of every scrape request made by this module.
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
/**
 * Price scraper for Geizhals product pages.
 *
 * `scrape` never throws: HTTP errors, network failures / timeouts and pages
 * without a parsable price are all reported through the `error` field of the
 * returned result, with `price: null` and `availability: 'unknown'`.
 */
export const geizhalsScraper: PriceScraper = {
  shop: 'geizhals',

  /**
   * Downloads `url` and extracts price, product name and image URL from the HTML.
   *
   * @param url - Product page URL to fetch.
   * @returns The scrape result; `currency` is always `'EUR'`. `error` is
   *   `HTTP <status>` for non-2xx responses, `price-selector-missed` when no
   *   candidate yields a positive price, or the message of a thrown error.
   */
  async scrape(url: string): Promise<ScrapeResult> {
    try {
      const res = await fetch(url, {
        headers: { 'User-Agent': UA, 'Accept-Language': 'de-DE,de;q=0.9' },
        // Abort after 20 s; the resulting rejection is handled by the catch below.
        signal: AbortSignal.timeout(20_000),
      })
      if (!res.ok) {
        return { price: null, currency: 'EUR', availability: 'unknown', error: `HTTP ${res.status}` }
      }

      const $ = cheerio.load(await res.text())

      // Candidate price sources in priority order: visible price markup first,
      // then schema.org `itemprop="price"` attributes.
      const priceCandidates = [
        $('.gh_price').first().text(),
        $('span.gh_price strong').first().text(),
        $('[itemprop="price"]').attr('content'),
        $('meta[itemprop="price"]').attr('content'),
      ]

      // Use the first candidate that actually parses. A matched element whose
      // text is blank or non-numeric must not block the remaining fallbacks.
      let price: number | null = null
      for (const candidate of priceCandidates) {
        if (!candidate) continue
        price = parsePrice(candidate)
        if (price !== null) break
      }

      const name = ($('h1[itemprop="name"]').text() || $('h1').first().text() || '').trim()
      const imageUrl = $('img.product-gallery__image').first().attr('src')
        || $('meta[property="og:image"]').attr('content')
        || undefined

      if (price === null) {
        return { price: null, currency: 'EUR', availability: 'unknown', name, imageUrl, error: 'price-selector-missed' }
      }
      // NOTE: availability is reported as 'in_stock' whenever a price was found;
      // the page's stock information is not inspected.
      return { price, currency: 'EUR', availability: 'in_stock', name, imageUrl }
    } catch (err) {
      // Anything can be thrown in JS; only trust `.message` on real Error objects.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    }
  },
}
/**
 * Parses a price out of scraped text or an attribute value.
 *
 * Only the first price-like token is considered, so trailing numbers in the
 * same text (a second price, a percentage, ...) cannot be fused into the
 * result. Supported notations:
 * - German: `1.299,00` (dot groups thousands, comma is the decimal mark)
 * - whitespace-grouped thousands: `1 299,00`
 * - dot-decimal: `499.00`, and `1,234.56` with comma-grouped thousands
 *
 * A dot followed by exactly three digits is treated as a thousands separator
 * (`1.299` is read as 1299).
 *
 * @param text - Raw text that may contain a price.
 * @returns The positive price, or `null` when no positive finite number is found.
 */
function parsePrice(text: string): number | null {
  // First alternative: 1-3 digits followed by whitespace-separated groups of
  // three digits ("1 299,00"). Second alternative: a plain run of digits and
  // separators ("1.299,00", "499.00").
  const match = /\d{1,3}(?:\s\d{3})+(?!\d)(?:[.,]\d+)?|\d[\d.,]*/.exec(text)
  if (match === null) return null

  // Drop grouping whitespace and any dangling separator ("499,-" -> "499").
  const token = match[0].replace(/\s/g, '').replace(/[.,]+$/, '')

  const lastComma = token.lastIndexOf(',')
  const lastDot = token.lastIndexOf('.')
  const normalized =
    lastComma !== -1 && lastDot > lastComma
      ? // A dot after the last comma means the dot is the decimal mark and
        // the commas are grouping ("1,234.56").
        token.replace(/,/g, '')
      : // Otherwise German rules: strip thousands dots, comma becomes the decimal point.
        token.replace(/\.(?=\d{3}(\D|$))/g, '').replace(',', '.')

  const n = parseFloat(normalized)
  return Number.isFinite(n) && n > 0 ? n : null
}

3757
tests/fixtures/geizhals-gpu.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,32 @@
import { describe, it, expect, vi, beforeEach } from 'vitest'
import { readFileSync } from 'node:fs'
import { join } from 'node:path'
import { geizhalsScraper } from '@/lib/scrapers/geizhals'
// HTML fixture (fixtures/geizhals-gpu.html) served as the mocked response body.
const fixture = readFileSync(join(__dirname, '../fixtures/geizhals-gpu.html'), 'utf-8')

// Default for every test: `fetch` resolves with a 200 response whose body is
// the fixture. `vi.stubGlobal` remembers the real `fetch`, so no type cast is
// needed and the original can be restored afterwards.
beforeEach(() => {
  vi.stubGlobal(
    'fetch',
    vi.fn().mockResolvedValue({
      ok: true,
      status: 200,
      text: async () => fixture,
    }),
  )
  // Vitest runs a function returned from `beforeEach` as cleanup after each
  // test; restore the real global so the mock cannot leak.
  return () => {
    vi.unstubAllGlobals()
  }
})
// Covers each result branch of geizhalsScraper.scrape: success, non-2xx
// response, page without price markup, and a rejected fetch.
describe('geizhalsScraper', () => {
  it('extracts price and name', async () => {
    const r = await geizhalsScraper.scrape('https://geizhals.de/test')
    expect(r.price).toBeGreaterThan(0)
    expect(r.currency).toBe('EUR')
    expect(r.name).toBeTruthy()
    expect(r.error).toBeUndefined()
  })

  it('returns error on HTTP failure', async () => {
    vi.stubGlobal(
      'fetch',
      vi.fn().mockResolvedValue({ ok: false, status: 503, text: async () => '' }),
    )
    const r = await geizhalsScraper.scrape('https://geizhals.de/test')
    expect(r.price).toBeNull()
    expect(r.availability).toBe('unknown')
    expect(r.error).toMatch(/HTTP 503/)
  })

  it('reports price-selector-missed when the page has no price markup', async () => {
    vi.stubGlobal(
      'fetch',
      vi.fn().mockResolvedValue({
        ok: true,
        status: 200,
        text: async () => '<html><body><h1>GPU</h1></body></html>',
      }),
    )
    const r = await geizhalsScraper.scrape('https://geizhals.de/test')
    expect(r.price).toBeNull()
    expect(r.name).toBe('GPU')
    expect(r.error).toBe('price-selector-missed')
  })

  it('returns the error message when fetch rejects', async () => {
    vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('network down')))
    const r = await geizhalsScraper.scrape('https://geizhals.de/test')
    expect(r.price).toBeNull()
    expect(r.error).toBe('network down')
  })
})