feat: amazon scraper with playwright + html parser tests
This commit is contained in:
63
src/lib/scrapers/amazon.ts
Normal file
63
src/lib/scrapers/amazon.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
import * as cheerio from 'cheerio'
|
||||
import type { PriceScraper, ScrapeResult } from './types'
|
||||
|
||||
/**
 * Extracts price, title, image and availability from the HTML of an Amazon
 * product page.
 *
 * The currency is always reported as `'EUR'`. Two failure modes are signalled
 * through `error` instead of throwing:
 * - `'captcha'` when the markup matches the bot-check markers;
 * - `'price-selector-missed'` when none of the known price selectors yields a
 *   parseable amount (name, image and availability are still returned).
 *
 * @param html - Raw HTML of the page.
 * @returns The scrape result; `price` is `null` whenever `error` is set.
 */
export function parseAmazonHtml(html: string): ScrapeResult {
  // A bot-check page carries no product data, so bail out before parsing the DOM.
  if (/validateCaptcha|api-services-support@amazon/i.test(html)) {
    return { price: null, currency: 'EUR', availability: 'unknown', error: 'captcha' }
  }
  const $ = cheerio.load(html)

  // Tried in order; the first selector whose text parses to a valid price wins.
  const priceSelectors = [
    '#corePrice_feature_div .a-offscreen',
    '#corePriceDisplay_desktop_feature_div .a-offscreen',
    'span.priceToPay .a-offscreen',
    '#priceblock_ourprice',
    '#priceblock_dealprice',
    '.a-price .a-offscreen',
  ]

  // Keep looking when a candidate is present but unusable (whitespace-only or
  // non-numeric text) instead of giving up after the first non-empty match.
  let price: number | null = null
  for (const selector of priceSelectors) {
    const text = $(selector).first().text()
    if (!text) continue
    price = parsePrice(text)
    if (price !== null) break
  }

  // Trim each candidate before falling back so a whitespace-only #productTitle
  // does not shadow a usable h1#title.
  const name = $('#productTitle').text().trim() || $('h1#title').text().trim()

  // `||` (not `??`) on purpose: an empty attribute value should also fall through.
  const imageUrl = $('#landingImage').attr('src')
    || $('#imgBlkFront').attr('src')
    || $('meta[property="og:image"]').attr('content')
    || undefined

  // German and English "unavailable" notices in the availability box.
  const outOfStock = /derzeit nicht verf|currently unavailable/i.test($('#availability').text())
  // Without an explicit out-of-stock notice, a found price is taken as "in stock".
  const availability = outOfStock ? 'out_of_stock' : (price !== null ? 'in_stock' : 'unknown')

  if (price === null) {
    return { price: null, currency: 'EUR', availability, name, imageUrl, error: 'price-selector-missed' }
  }
  return { price, currency: 'EUR', availability, name, imageUrl }
}
|
||||
|
||||
/**
 * Parses a displayed price such as `"1.299,00 €"` into a number.
 *
 * German notation (dot as thousands separator, comma as decimal mark) is the
 * default interpretation. English notation is recognised only when it is
 * unambiguous, i.e. comma-grouped thousands followed by a dot and decimals
 * (`"1,299.00"`); previously such input was misread as `1.299`.
 *
 * @param text - Raw price text; currency symbols and whitespace are ignored.
 * @returns The positive amount, or `null` when the text holds no positive number.
 */
function parsePrice(text: string): number | null {
  // Keep digits and separators only, then drop trailing separators (e.g. a
  // sentence-ending dot) so they cannot be mistaken for a decimal mark.
  const numeric = text.replace(/[^\d.,]/g, '').replace(/[.,]+$/, '')
  if (!numeric) return null

  const isEnglishGrouped = /^\d{1,3}(?:,\d{3})+\.\d+$/.test(numeric)
  const normalized = isEnglishGrouped
    // "1,299.00": commas are grouping only and the dot is already the decimal mark.
    ? numeric.replace(/,/g, '')
    // "1.299,00": remove dots that precede a three-digit group, then turn the
    // decimal comma into a dot.
    : numeric.replace(/\.(?=\d{3}(\D|$))/g, '').replace(',', '.')

  const value = parseFloat(normalized)
  // Zero and negative amounts are not treated as valid prices.
  return Number.isFinite(value) && value > 0 ? value : null
}
|
||||
|
||||
/**
 * `PriceScraper` for the `'amazon'` shop: renders the product page in headless
 * Chromium via Playwright and hands the resulting HTML to `parseAmazonHtml`.
 */
export const amazonScraper: PriceScraper = {
  shop: 'amazon',

  /**
   * Loads `url` in a fresh browser and parses the page.
   *
   * Errors raised after the browser has launched (navigation timeout, parsing
   * failure, ...) are reported through the result's `error` field. Failures to
   * import or launch Playwright still reject the returned promise.
   *
   * @param url - Product page URL to load.
   * @returns The parsed result, or a `price: null` result carrying the error message.
   */
  async scrape(url: string): Promise<ScrapeResult> {
    // Playwright is imported on demand, when a scrape is actually requested.
    const { chromium } = await import('playwright')
    // NOTE(review): these flags look intended for containerised runs — confirm.
    const browser = await chromium.launch({ args: ['--no-sandbox', '--disable-dev-shm-usage'] })
    try {
      // Request the German storefront rendering via locale and Accept-Language.
      const ctx = await browser.newContext({
        userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        locale: 'de-DE',
        extraHTTPHeaders: { 'Accept-Language': 'de-DE,de;q=0.9' },
      })
      const page = await ctx.newPage()
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30_000 })
      const html = await page.content()
      return parseAmazonHtml(html)
    } catch (err: unknown) {
      // Narrow instead of asserting: a non-Error throw must still produce a
      // usable message rather than `error: undefined`.
      const message = err instanceof Error ? err.message : String(err)
      return { price: null, currency: 'EUR', availability: 'unknown', error: message }
    } finally {
      // Always release the browser, on success and on failure alike.
      await browser.close()
    }
  },
}
|
||||
7563
tests/fixtures/amazon-ps5.html
vendored
Normal file
7563
tests/fixtures/amazon-ps5.html
vendored
Normal file
File diff suppressed because one or more lines are too long
23
tests/scrapers/amazon.test.ts
Normal file
23
tests/scrapers/amazon.test.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { readFileSync } from 'node:fs'
|
||||
import { join } from 'node:path'
|
||||
import { parseAmazonHtml } from '@/lib/scrapers/amazon'
|
||||
|
||||
// HTML fixture read once from ../fixtures and shared by the tests below.
const ps5PageHtml = readFileSync(join(__dirname, '../fixtures/amazon-ps5.html'), 'utf-8')

describe('parseAmazonHtml', () => {
  // Happy path: the fixture page must yield a positive EUR price plus metadata.
  it('extracts price, name, image', () => {
    const { price, currency, name, imageUrl } = parseAmazonHtml(ps5PageHtml)

    expect(price).toBeGreaterThan(0)
    expect(currency).toBe('EUR')
    expect(name).toBeTruthy()
    expect(imageUrl).toMatch(/^https?:\/\//)
  })

  // A bot-check form must be reported as a captcha, not as a missing price.
  it('detects captcha page', () => {
    const { price, error } = parseAmazonHtml(
      '<html><body><form action="/errors/validateCaptcha"></form></body></html>',
    )

    expect(price).toBeNull()
    expect(error).toBe('captcha')
  })
})
|
||||
Reference in New Issue
Block a user