feat: geizhals scraper with cheerio + tests
This commit is contained in:
48
src/lib/scrapers/geizhals.ts
Normal file
48
src/lib/scrapers/geizhals.ts
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
import * as cheerio from 'cheerio'
|
||||||
|
import type { PriceScraper, ScrapeResult } from './types'
|
||||||
|
|
||||||
|
// User-Agent header value sent with every scrape request made by this module.
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
|
||||||
|
|
||||||
|
/**
 * Price scraper for Geizhals product pages.
 *
 * `scrape` never rejects: network errors, timeouts, non-2xx responses and
 * unparseable pages are all reported through the `error` field of the
 * returned result, with `price` set to `null`.
 */
export const geizhalsScraper: PriceScraper = {
  shop: 'geizhals',

  /**
   * Downloads `url` and extracts price, product name and image URL.
   *
   * @param url - Product page URL to fetch.
   * @returns The scrape result. `currency` is always `'EUR'`. `availability`
   *   is `'in_stock'` whenever a price was parsed (the page's stock status is
   *   not inspected) and `'unknown'` otherwise.
   */
  async scrape(url: string): Promise<ScrapeResult> {
    try {
      const res = await fetch(url, {
        headers: { 'User-Agent': UA, 'Accept-Language': 'de-DE,de;q=0.9' },
        // Abort the request if it has not completed after 20 seconds.
        signal: AbortSignal.timeout(20_000),
      })
      if (!res.ok) {
        return { price: null, currency: 'EUR', availability: 'unknown', error: `HTTP ${res.status}` }
      }

      const $ = cheerio.load(await res.text())

      // Price sources in order of preference: visible price elements first,
      // then structured-data attributes.
      const priceCandidates = [
        $('.gh_price').first().text(),
        $('span.gh_price strong').first().text(),
        $('[itemprop="price"]').attr('content'),
        $('meta[itemprop="price"]').attr('content'),
      ]

      // Use the first candidate that actually parses. A matching element may
      // contain only whitespace or non-numeric text, in which case the
      // remaining fallbacks must still get a chance.
      let price: number | null = null
      for (const candidate of priceCandidates) {
        if (!candidate) continue
        price = parsePrice(candidate)
        if (price !== null) break
      }

      const name = ($('h1[itemprop="name"]').text() || $('h1').first().text() || '').trim()
      const imageUrl = $('img.product-gallery__image').first().attr('src')
        || $('meta[property="og:image"]').attr('content')
        || undefined

      if (price === null) {
        // Still return name/image so callers can see the page itself was parsed.
        return { price: null, currency: 'EUR', availability: 'unknown', name, imageUrl, error: 'price-selector-missed' }
      }

      return { price, currency: 'EUR', availability: 'in_stock', name, imageUrl }
    } catch (err) {
      // Anything can be thrown; only Error instances carry a usable message.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    }
  },
}
|
||||||
|
|
||||||
|
/**
 * Parses a price from free-form text.
 *
 * Handles German notation ("1.234,56"), plain decimals ("1234.56"),
 * whitespace-grouped thousands ("1 234,56") and dash cents ("1.299,--").
 * Only the first number in the text is considered, so trailing text such as
 * "(2 Angebote)" cannot leak digits into the result.
 *
 * @param text - Raw text or attribute value that may contain a price.
 * @returns The price as a positive finite number, or `null` when the text
 *   contains no number or the number is not greater than zero.
 */
function parsePrice(text: string): number | null {
  // First numeric run: optional leading separator, digits, optional
  // whitespace-grouped thousands (exactly three digits per group), then any
  // mix of '.'/',' separated digit groups. Whitespace grouping is only
  // accepted before the first separator so that text following the decimal
  // part is never absorbed.
  const token = /[.,]?\d+(?:[ \u00a0\u202f]\d{3}(?!\d))*(?:[.,]\d*)*/.exec(text)?.[0]
  if (token === undefined) return null

  const normalized = token
    .replace(/[^\d.,]/g, '') // drop whitespace used as thousands grouping
    .replace(/\.(?=\d{3}(\D|$))/g, '') // drop dots used as thousands separators
    .replace(',', '.') // decimal comma -> decimal point

  const value = parseFloat(normalized)
  return Number.isFinite(value) && value > 0 ? value : null
}
|
||||||
3757
tests/fixtures/geizhals-gpu.html
vendored
Normal file
3757
tests/fixtures/geizhals-gpu.html
vendored
Normal file
File diff suppressed because one or more lines are too long
32
tests/scrapers/geizhals.test.ts
Normal file
32
tests/scrapers/geizhals.test.ts
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
import { describe, it, expect, vi, beforeEach } from 'vitest'
|
||||||
|
import { readFileSync } from 'node:fs'
|
||||||
|
import { join } from 'node:path'
|
||||||
|
import { geizhalsScraper } from '@/lib/scrapers/geizhals'
|
||||||
|
|
||||||
|
// HTML fixture loaded once from tests/fixtures/geizhals-gpu.html; used as the body of the mocked fetch response.
const fixture = readFileSync(join(__dirname, '../fixtures/geizhals-gpu.html'), 'utf-8')
|
||||||
|
|
||||||
|
// Before every test, replace the global fetch with a mock that resolves to a
// successful response whose body is the HTML fixture.
beforeEach(() => {
  const okResponse = {
    ok: true,
    status: 200,
    text: async () => fixture,
  }
  const fetchMock = vi.fn().mockResolvedValue(okResponse)
  global.fetch = fetchMock as unknown as typeof fetch
})
|
||||||
|
|
||||||
|
describe('geizhalsScraper', () => {
  // Happy path: the default mocked fetch serves the HTML fixture.
  it('extracts price and name', async () => {
    const result = await geizhalsScraper.scrape('https://geizhals.de/test')

    expect(result.price).toBeGreaterThan(0)
    expect(result.currency).toBe('EUR')
    expect(result.name).toBeTruthy()
  })

  // Failure path: a non-OK response must surface the HTTP status as an error.
  it('returns error on HTTP failure', async () => {
    const unavailableResponse = { ok: false, status: 503, text: async () => '' }
    global.fetch = vi.fn().mockResolvedValue(unavailableResponse) as unknown as typeof fetch

    const result = await geizhalsScraper.scrape('https://geizhals.de/test')

    expect(result.price).toBeNull()
    expect(result.error).toMatch(/HTTP 503/)
  })
})
|
||||||
Reference in New Issue
Block a user