feat: amazon scraper with playwright + html parser tests

This commit is contained in:
2026-05-25 14:09:44 +00:00
parent f86055b85d
commit 98fc938f91
3 changed files with 7649 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
import * as cheerio from 'cheerio'
import type { PriceScraper, ScrapeResult } from './types'
/**
 * Extracts price, title, image and availability from the HTML of an Amazon
 * product page.
 *
 * Pure function (no network access), so it can be exercised against saved
 * fixtures.
 *
 * @param html - Full page markup.
 * @returns A result whose `currency` is always `'EUR'`. `error` is `'captcha'`
 *   when the markup matches the bot-check patterns, and
 *   `'price-selector-missed'` when none of the known price selectors yields a
 *   parseable positive amount.
 */
export function parseAmazonHtml(html: string): ScrapeResult {
  // Bot-check pages are recognised by these markers; bail out before parsing.
  if (/validateCaptcha|api-services-support@amazon/i.test(html)) {
    return { price: null, currency: 'EUR', availability: 'unknown', error: 'captcha' }
  }

  const $ = cheerio.load(html)

  // Tried in order; the first selector whose text parses to a price wins.
  const priceSelectors = [
    '#corePrice_feature_div .a-offscreen',
    '#corePriceDisplay_desktop_feature_div .a-offscreen',
    'span.priceToPay .a-offscreen',
    '#priceblock_ourprice',
    '#priceblock_dealprice',
    '.a-price .a-offscreen',
  ]

  // Parse every candidate instead of only the first non-empty one: a
  // whitespace-only or non-numeric match from an early selector must not hide
  // a valid price found by a later selector.
  let price: number | null = null
  for (const selector of priceSelectors) {
    price = parsePrice($(selector).first().text())
    if (price !== null) break
  }

  const name = ($('#productTitle').text() || $('h1#title').text() || '').trim()
  const imageUrl = $('#landingImage').attr('src')
    || $('#imgBlkFront').attr('src')
    || $('meta[property="og:image"]').attr('content')
    || undefined

  // German and English "unavailable" wording inside the availability box.
  const outOfStock = /derzeit nicht verf|currently unavailable/i.test($('#availability').text())
  const availability = outOfStock ? 'out_of_stock' : (price !== null ? 'in_stock' : 'unknown')

  if (price === null) {
    return { price: null, currency: 'EUR', availability, name, imageUrl, error: 'price-selector-missed' }
  }
  return { price, currency: 'EUR', availability, name, imageUrl }
}
/**
 * Converts a displayed price string into a number.
 *
 * Handles German-style amounts (`"1.299,99 €"`: dot groups thousands, comma is
 * the decimal mark) as well as amounts where commas group thousands and a dot
 * is the decimal mark (`"$1,299.99"`).
 *
 * @param text - Raw text of a price element; currency symbols, letters and
 *   whitespace are ignored.
 * @returns The positive amount, or `null` when the text contains no digits or
 *   does not parse to a finite number greater than zero.
 */
function parsePrice(text: string): number | null {
  // Keep only digits and the two possible separator characters.
  const stripped = text.replace(/[^\d.,]/g, '')
  if (!stripped) return null

  const lastComma = stripped.lastIndexOf(',')
  const lastDot = stripped.lastIndexOf('.')

  let normalized: string
  if (lastComma !== -1 && lastDot > lastComma) {
    // A dot after the last comma means the dot is the decimal mark and the
    // commas are grouping separators ("1,299.99"), so drop the commas.
    normalized = stripped.replace(/,/g, '')
  } else {
    // Otherwise: a dot followed by exactly three digits is a thousands
    // separator and is removed; the comma becomes the decimal point.
    normalized = stripped.replace(/\.(?=\d{3}(\D|$))/g, '').replace(',', '.')
  }

  const amount = parseFloat(normalized)
  return Number.isFinite(amount) && amount > 0 ? amount : null
}
/**
 * Scraper for Amazon product pages: loads the URL in headless Chromium via
 * Playwright and hands the resulting markup to `parseAmazonHtml`.
 *
 * Failures during context creation, navigation or parsing are returned as a
 * result with `price: null` and the failure message in `error`. Failures while
 * importing Playwright or launching the browser happen before the `try` and
 * therefore reject the returned promise.
 */
export const amazonScraper: PriceScraper = {
  shop: 'amazon',
  async scrape(url: string): Promise<ScrapeResult> {
    // Dynamic import: Playwright is only loaded when a scrape actually runs,
    // not when this module is imported.
    const { chromium } = await import('playwright')
    const browser = await chromium.launch({ args: ['--no-sandbox', '--disable-dev-shm-usage'] })
    try {
      // German locale and Accept-Language header are sent with the request.
      const ctx = await browser.newContext({
        userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        locale: 'de-DE',
        extraHTTPHeaders: { 'Accept-Language': 'de-DE,de;q=0.9' },
      })
      const page = await ctx.newPage()
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30_000 })
      const html = await page.content()
      return parseAmazonHtml(html)
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances carry a `message`, so
      // fall back to the string form to avoid reporting `error: undefined`.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    } finally {
      // Always release the browser process, on success and on failure.
      await browser.close()
    }
  },
}

7563
tests/fixtures/amazon-ps5.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,23 @@
import { describe, it, expect } from 'vitest'
import { readFileSync } from 'node:fs'
import { join } from 'node:path'
import { parseAmazonHtml } from '@/lib/scrapers/amazon'
// Saved product page used as parser input, read once for the whole suite.
const fixturePath = join(__dirname, '../fixtures/amazon-ps5.html')
const productPageHtml = readFileSync(fixturePath, 'utf-8')

describe('parseAmazonHtml', () => {
  it('extracts price, name, image', () => {
    const { price, currency, name, imageUrl } = parseAmazonHtml(productPageHtml)

    expect(price).toBeGreaterThan(0)
    expect(currency).toBe('EUR')
    expect(name).toBeTruthy()
    expect(imageUrl).toMatch(/^https?:\/\//)
  })

  it('detects captcha page', () => {
    // Minimal markup containing the captcha form action the parser looks for.
    const result = parseAmazonHtml(
      '<html><body><form action="/errors/validateCaptcha"></form></body></html>',
    )

    expect(result.price).toBeNull()
    expect(result.error).toBe('captcha')
  })
})