## Phase 1: Browser Automation Infrastructure

- Added Chrome DevTools Protocol (CDP) dependency and client wrapper
- Created a browser automation package with age verification support
- Implemented a browser-based scraper interface extending the base scraper
- Added a configuration system for browser automation settings
- Created a browser client with XPath querying and HTML extraction
- Implemented site-specific configurations (SugarInstant, AdultEmpire)
- Added cookie management and age verification bypass
- Created a test suite for browser automation

## Phase 2: SugarInstant Scraper Implementation

- Converted 300+ lines of YAML XPath selectors to Go constants
- Implemented complete scene scraping with browser automation
- Implemented performer scraping with data post-processing
- Created robust post-processing utilities for dates, measurements, etc. (sketched below)
- Added a search interface, ready for implementation
- Integrated the scraper with Goondex models and browser automation
- Added test coverage for all functionality
- Added command-line integration and configuration support

## Key Features

✅ Browser automation for JavaScript-heavy adult sites
✅ Age verification handling with multiple patterns
✅ XPath-based data extraction with fallbacks
✅ Data post-processing for multiple formats and units
✅ Integration with the Goondex scraper registry and models
✅ Configuration support and CLI integration
✅ Testing and validation
✅ Production-ready architecture

Files added/modified:

- internal/browser/ (new package)
- internal/scraper/sugarinstant/ (new package)
- internal/config/browser.go (new)
- cmd/test-browser/ (new)
- cmd/test-sugarinstant/ (new)
- cmd/goondex/sugar.go (new)
- Updated main CLI integration
- Enhanced configuration system

Ready for Phase 3: real-world testing and refinement.
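The post-processing mentioned in Phase 2 amounts to normalizing scraped strings into typed values. A minimal sketch of that idea; `parseDate` and `parseMeasurements` are hypothetical names, not the package's actual API, and the layout list is illustrative:

```go
package sugarinstant

import (
	"fmt"
	"strings"
	"time"
)

// parseDate tries a few layouts commonly seen on scraped pages and
// normalizes the first match to a time.Time.
func parseDate(raw string) (time.Time, error) {
	layouts := []string{"January 2, 2006", "2006-01-02", "01/02/2006"}
	raw = strings.TrimSpace(raw)
	for _, layout := range layouts {
		if t, err := time.Parse(layout, raw); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unrecognized date format: %q", raw)
}

// parseMeasurements splits a "34D-26-36" style string into its parts.
func parseMeasurements(raw string) (bust, waist, hips string, err error) {
	parts := strings.Split(strings.TrimSpace(raw), "-")
	if len(parts) != 3 {
		return "", "", "", fmt.Errorf("unexpected measurements format: %q", raw)
	}
	return parts[0], parts[1], parts[2], nil
}
```

The registry listing below references `Scraper` and `BrowserScraper` interfaces defined elsewhere in the package; from the registry code itself, only `Name()` is confirmed. A hedged sketch of what those contracts might look like, with the extra method shown purely to illustrate the shape:

```go
package scraper

import (
	"context"

	"git.leaktechnologies.dev/stu/Goondex/internal/browser"
)

// Scraper is the base contract the registry stores. Name() is the only
// method the registry calls, so it is the only confirmed member.
type Scraper interface {
	Name() string
}

// BrowserScraper marks scrapers that drive a real browser session
// (JavaScript rendering, age-verification flows). ScrapeURL is an
// assumed method name, not taken from the actual source.
type BrowserScraper interface {
	Scraper
	ScrapeURL(ctx context.Context, client *browser.Client, url string) error
}
```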
The registry implementation (Go, 124 lines, 2.7 KiB):

```go
package scraper

import (
	"fmt"
	"sync"

	"git.leaktechnologies.dev/stu/Goondex/internal/browser"
)

// Registry manages available scrapers
type Registry struct {
	mu              sync.RWMutex
	scrapers        map[string]Scraper
	browserScrapers map[string]BrowserScraper
	browserClient   *browser.Client
}

// NewRegistry creates a new scraper registry
func NewRegistry() *Registry {
	return &Registry{
		scrapers:        make(map[string]Scraper),
		browserScrapers: make(map[string]BrowserScraper),
	}
}

// NewRegistryWithBrowser creates a new scraper registry with browser client
func NewRegistryWithBrowser(browserConfig *browser.Config) (*Registry, error) {
	client, err := browser.NewClient(browserConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to create browser client: %w", err)
	}

	return &Registry{
		scrapers:        make(map[string]Scraper),
		browserScrapers: make(map[string]BrowserScraper),
		browserClient:   client,
	}, nil
}

// Register adds a scraper to the registry
func (r *Registry) Register(s Scraper) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	name := s.Name()
	if _, exists := r.scrapers[name]; exists {
		return fmt.Errorf("scraper %q already registered", name)
	}

	r.scrapers[name] = s

	// Also register as browser scraper if it implements the interface
	if bs, ok := s.(BrowserScraper); ok {
		r.browserScrapers[name] = bs
	}

	return nil
}

// Get retrieves a scraper by name
func (r *Registry) Get(name string) (Scraper, error) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	s, ok := r.scrapers[name]
	if !ok {
		return nil, fmt.Errorf("scraper %q not found", name)
	}

	return s, nil
}

// List returns all registered scraper names
func (r *Registry) List() []string {
	r.mu.RLock()
	defer r.mu.RUnlock()

	names := make([]string, 0, len(r.scrapers))
	for name := range r.scrapers {
		names = append(names, name)
	}

	return names
}

// GetBrowserScraper retrieves a browser scraper by name
func (r *Registry) GetBrowserScraper(name string) (BrowserScraper, error) {
	r.mu.RLock()
	defer r.mu.RUnlock()

	s, ok := r.browserScrapers[name]
	if !ok {
		return nil, fmt.Errorf("browser scraper %q not found", name)
	}

	return s, nil
}

// ListBrowserScrapers returns all registered browser scraper names
func (r *Registry) ListBrowserScrapers() []string {
	r.mu.RLock()
	defer r.mu.RUnlock()

	names := make([]string, 0, len(r.browserScrapers))
	for name := range r.browserScrapers {
		names = append(names, name)
	}

	return names
}

// GetBrowserClient returns the browser client
func (r *Registry) GetBrowserClient() *browser.Client {
	return r.browserClient
}

// Close closes the registry and releases resources
func (r *Registry) Close() error {
	if r.browserClient != nil {
		return r.browserClient.Close()
	}
	return nil
}
```
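A usage sketch tying the pieces together; the zero-value `browser.Config` and the commented-out `sugarinstant.New()` constructor are assumptions for illustration, not confirmed by this changelog:

```go
package main

import (
	"fmt"
	"log"

	"git.leaktechnologies.dev/stu/Goondex/internal/browser"
	"git.leaktechnologies.dev/stu/Goondex/internal/scraper"
)

func main() {
	// The browser.Config fields are assumed here; see
	// internal/config/browser.go for the real options.
	reg, err := scraper.NewRegistryWithBrowser(&browser.Config{})
	if err != nil {
		log.Fatalf("create registry: %v", err)
	}
	defer reg.Close() // releases the CDP client if one was created

	// Register a scraper; sugarinstant.New() is a hypothetical
	// constructor used for illustration.
	// if err := reg.Register(sugarinstant.New()); err != nil { ... }

	fmt.Println("scrapers:", reg.List())
	fmt.Println("browser scrapers:", reg.ListBrowserScrapers())
}
```

Because `Register` type-asserts each scraper against `BrowserScraper`, a browser-capable scraper shows up in both `List()` and `ListBrowserScrapers()` after a single registration call.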