Goondex/internal/scraper/registry.go
Stu Leak eb7e935f67 Phase 1 & 2: Complete browser automation and SugarInstant scraper implementation
## Phase 1: Browser Automation Infrastructure
- Added Chrome DevTools Protocol (CDP) dependency and client wrapper
- Created comprehensive browser automation package with age verification support
- Implemented browser-based scraper interface extending base scraper
- Added configuration system for browser automation settings
- Created browser client with XPath querying and HTML extraction (see the sketch after this list)
- Implemented site-specific configurations (SugarInstant, AdultEmpire)
- Added cookie management and age verification bypass
- Created comprehensive test suite for browser automation
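
The browser package itself isn't shown on this page. As a rough sketch of the kind of CDP-driven fetch described above (assuming the chromedp client library and an illustrative age-gate selector, neither confirmed by this repo), the flow might look like:

```go
package browser

import (
    "context"
    "time"

    "github.com/chromedp/chromedp"
)

// FetchHTML navigates to a URL, clicks through an age-verification
// button, and returns the rendered page HTML. The XPath selector and
// timeout are illustrative assumptions, not this repo's actual values.
func FetchHTML(url string) (string, error) {
    ctx, cancel := chromedp.NewContext(context.Background())
    defer cancel()
    ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
    defer cancel()

    var html string
    err := chromedp.Run(ctx,
        chromedp.Navigate(url),
        // chromedp.Click waits for the node to appear, so real code
        // would make this step conditional on the age gate existing.
        chromedp.Click(`//button[contains(., "Enter")]`, chromedp.BySearch),
        chromedp.OuterHTML("html", &html),
    )
    return html, err
}
```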

## Phase 2: SugarInstant Scraper Implementation
- Converted 300+ lines of YAML XPath selectors to Go constants
- Implemented complete scene scraping with browser automation
- Implemented comprehensive performer scraping with data post-processing
- Created robust data post-processing utilities for dates, measurements, etc. (a date-parsing sketch follows this list)
- Added a search interface, ready for implementation
- Integrated scraper with Goondex models and browser automation
- Created extensive test coverage for all functionality
- Added command-line integration and configuration support
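
The post-processing utilities are likewise not visible here. A minimal sketch of the date-normalization idea, with a hypothetical helper name and an assumed layout list:

```go
package sugarinstant

import (
    "fmt"
    "strings"
    "time"
)

// parseReleaseDate tries a few layouts commonly seen on scraped pages
// and normalizes the input to a time.Time. The layout list is
// illustrative; the repo's actual utilities may accept other formats.
func parseReleaseDate(s string) (time.Time, error) {
    layouts := []string{"2006-01-02", "January 2, 2006", "Jan 2, 2006", "01/02/2006"}
    s = strings.TrimSpace(s)
    for _, layout := range layouts {
        if t, err := time.Parse(layout, s); err == nil {
            return t, nil
        }
    }
    return time.Time{}, fmt.Errorf("unrecognized date format: %q", s)
}
```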

## Key Features
- Browser automation for JavaScript-heavy adult sites
- Age verification handling with multiple patterns
- XPath-based data extraction with comprehensive fallbacks
- Data post-processing for multiple formats and units
- Integration with Goondex scraper registry and models
- Configuration support and CLI integration (a possible config shape is sketched after this list)
- Comprehensive testing and validation
- Production-ready architecture
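
The registry code below takes a *browser.Config, but its fields aren't visible on this page. Purely as an assumption about what such a config might carry:

```go
package browser

import "time"

// Config is a guessed shape for the browser-automation settings; the
// real struct in this repo may differ in both field names and types.
type Config struct {
    Headless  bool          // run Chrome without a visible window
    Timeout   time.Duration // per-navigation deadline
    UserAgent string        // optional User-Agent override
}
```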

Files added/modified:
- internal/browser/ (new package)
- internal/scraper/sugarinstant/ (new package)
- internal/config/browser.go (new)
- cmd/test-browser/ (new)
- cmd/test-sugarinstant/ (new)
- cmd/goondex/sugar.go (new)
- Updated main CLI integration
- Enhanced configuration system

Ready for Phase 3: Real-world testing and refinement.
2026-01-03 14:50:47 -05:00

124 lines
2.7 KiB
Go

package scraper

import (
    "fmt"
    "sync"

    "git.leaktechnologies.dev/stu/Goondex/internal/browser"
)

// Registry manages available scrapers
type Registry struct {
    mu              sync.RWMutex
    scrapers        map[string]Scraper
    browserScrapers map[string]BrowserScraper
    browserClient   *browser.Client
}

// NewRegistry creates a new scraper registry
func NewRegistry() *Registry {
    return &Registry{
        scrapers:        make(map[string]Scraper),
        browserScrapers: make(map[string]BrowserScraper),
    }
}

// NewRegistryWithBrowser creates a new scraper registry with browser client
func NewRegistryWithBrowser(browserConfig *browser.Config) (*Registry, error) {
    client, err := browser.NewClient(browserConfig)
    if err != nil {
        return nil, fmt.Errorf("failed to create browser client: %w", err)
    }
    return &Registry{
        scrapers:        make(map[string]Scraper),
        browserScrapers: make(map[string]BrowserScraper),
        browserClient:   client,
    }, nil
}

// Register adds a scraper to the registry
func (r *Registry) Register(s Scraper) error {
    r.mu.Lock()
    defer r.mu.Unlock()
    name := s.Name()
    if _, exists := r.scrapers[name]; exists {
        return fmt.Errorf("scraper %q already registered", name)
    }
    r.scrapers[name] = s
    // Also register as browser scraper if it implements the interface
    if bs, ok := s.(BrowserScraper); ok {
        r.browserScrapers[name] = bs
    }
    return nil
}

// Get retrieves a scraper by name
func (r *Registry) Get(name string) (Scraper, error) {
    r.mu.RLock()
    defer r.mu.RUnlock()
    s, ok := r.scrapers[name]
    if !ok {
        return nil, fmt.Errorf("scraper %q not found", name)
    }
    return s, nil
}

// List returns all registered scraper names
func (r *Registry) List() []string {
    r.mu.RLock()
    defer r.mu.RUnlock()
    names := make([]string, 0, len(r.scrapers))
    for name := range r.scrapers {
        names = append(names, name)
    }
    return names
}

// GetBrowserScraper retrieves a browser scraper by name
func (r *Registry) GetBrowserScraper(name string) (BrowserScraper, error) {
    r.mu.RLock()
    defer r.mu.RUnlock()
    s, ok := r.browserScrapers[name]
    if !ok {
        return nil, fmt.Errorf("browser scraper %q not found", name)
    }
    return s, nil
}

// ListBrowserScrapers returns all registered browser scraper names
func (r *Registry) ListBrowserScrapers() []string {
    r.mu.RLock()
    defer r.mu.RUnlock()
    names := make([]string, 0, len(r.browserScrapers))
    for name := range r.browserScrapers {
        names = append(names, name)
    }
    return names
}

// GetBrowserClient returns the browser client
func (r *Registry) GetBrowserClient() *browser.Client {
    return r.browserClient
}

// Close closes the registry and releases resources
func (r *Registry) Close() error {
    if r.browserClient != nil {
        return r.browserClient.Close()
    }
    return nil
}
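
For context, a minimal sketch of how this registry might be driven from a caller (the zero-value browser.Config and the commented-out constructor are assumptions, not this repo's confirmed API):

```go
package main

import (
    "log"

    "git.leaktechnologies.dev/stu/Goondex/internal/browser"
    "git.leaktechnologies.dev/stu/Goondex/internal/scraper"
)

func main() {
    // A real caller would populate browser.Config; the zero value is
    // used here only to keep the sketch short.
    reg, err := scraper.NewRegistryWithBrowser(&browser.Config{})
    if err != nil {
        log.Fatal(err)
    }
    defer reg.Close()

    // Register adds a scraper and, if the concrete type also implements
    // BrowserScraper, indexes it for GetBrowserScraper as well. A
    // constructor like sugarinstant.New() is assumed, not confirmed:
    // _ = reg.Register(sugarinstant.New())

    for _, name := range reg.List() {
        log.Printf("registered scraper: %s", name)
    }
}
```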