Goondex/internal/scraper/adultemp/scraper.go

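// Package adultemp implements an Adult Empire metadata scraper. It
// fetches scene and performer pages over HTTP and extracts fields from
// the HTML via XPath queries.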
package adultemp

import (
	"context"
	"fmt"
	"strings"

	"git.leaktechnologies.dev/stu/Goondex/internal/model"
)

// Scraper implements Adult Empire scraping functionality.
type Scraper struct {
	client *Client
}

// NewScraper creates a new Adult Empire scraper.
func NewScraper() (*Scraper, error) {
	client, err := NewClient()
	if err != nil {
		return nil, err
	}
	return &Scraper{
		client: client,
	}, nil
}
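
// Typical usage (a sketch; ctx and sceneURL are assumed in scope, with
// sceneURL standing in for a real Adult Empire scene page URL):
//
//	s, err := NewScraper()
//	if err != nil {
//		return err
//	}
//	data, err := s.ScrapeSceneByURL(ctx, sceneURL)
//	if err != nil {
//		return err
//	}
//	scene := s.ConvertSceneToModel(data)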

// SetAuthToken sets the authentication token for the scraper.
func (s *Scraper) SetAuthToken(etoken string) error {
	return s.client.SetAuthToken(etoken)
}

// ScrapeSceneByURL scrapes a scene from its Adult Empire URL.
func (s *Scraper) ScrapeSceneByURL(ctx context.Context, url string) (*SceneData, error) {
	html, err := s.client.GetSceneByURL(ctx, url)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch scene: %w", err)
	}

	parser, err := NewXPathParser(html)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	scene := &SceneData{
		URL: url,
	}
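
	// The XPath selectors below assume Adult Empire's current page
	// markup; they will need updating if the site layout changes.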

	// Extract title
	scene.Title = parser.QueryString("//h1[@class='title']")

	// Extract date
	dateStr := parser.QueryString("//div[@class='release-date']/text()")
	scene.Date = ParseDate(dateStr)

	// Extract studio
	scene.Studio = parser.QueryString("//a[contains(@href, '/studio/')]/text()")

	// Extract cover image
	scene.Image = ExtractURL(
		parser.QueryAttr("//div[@class='item-image']//img", "src"),
		s.client.baseURL,
	)

	// Extract description
	desc := parser.QueryString("//div[@class='synopsis']")
	scene.Description = CleanText(desc)

	// Extract performers
	scene.Performers = parser.QueryStrings("//a[contains(@href, '/performer/')]/text()")

	// Extract tags/categories
	scene.Tags = parser.QueryStrings("//a[contains(@href, '/category/')]/text()")

	// Extract code/SKU
	scene.Code = parser.QueryString("//span[@class='sku']/text()")

	// Extract director
	scene.Director = parser.QueryString("//a[contains(@href, '/director/')]/text()")

	return scene, nil
}

// SearchScenesByName searches for scenes by title.
func (s *Scraper) SearchScenesByName(ctx context.Context, query string) ([]SearchResult, error) {
	html, err := s.client.SearchScenes(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to search scenes: %w", err)
	}

	parser, err := NewXPathParser(html)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	var results []SearchResult

	// Extract search result items using the official Stash scraper XPath:
	//   Title: //a[@class="boxcover"]/img/@title
	//   URL:   //a[@class="boxcover"]/@href
	//   Image: //a[@class="boxcover"]/img/@src
	titles := parser.QueryAttrs("//a[@class='boxcover']/img", "title")
	urls := parser.QueryAttrs("//a[@class='boxcover']", "href")
	images := parser.QueryAttrs("//a[@class='boxcover']/img", "src")
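
	// Results are paired by position; the bounds checks below tolerate
	// attribute lists of different lengths rather than panicking.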
	for i := range titles {
		result := SearchResult{
			Title: titles[i],
		}
		if i < len(urls) {
			result.URL = ExtractURL(urls[i], s.client.baseURL)
		}
		if i < len(images) {
			result.Image = ExtractURL(images[i], s.client.baseURL)
		}
		results = append(results, result)
	}

	return results, nil
}

// ScrapePerformerByURL scrapes a performer from their Adult Empire URL.
func (s *Scraper) ScrapePerformerByURL(ctx context.Context, url string) (*PerformerData, error) {
	html, err := s.client.GetPerformerByURL(ctx, url)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch performer: %w", err)
	}

	parser, err := NewXPathParser(html)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	performer := &PerformerData{
		URL: url,
	}

	// Extract name
	performer.Name = parser.QueryString("//h1[@class='performer-name']")

	// Extract image
	performer.Image = ExtractURL(
		parser.QueryAttr("//div[@class='performer-image']//img", "src"),
		s.client.baseURL,
	)

	// Extract birthdate
	performer.Birthdate = parser.QueryString("//span[@class='birthdate']/text()")

	// Extract ethnicity
	performer.Ethnicity = parser.QueryString("//span[@class='ethnicity']/text()")

	// Extract country
	performer.Country = parser.QueryString("//span[@class='country']/text()")

	// Extract height, normalized to centimeters
	heightStr := parser.QueryString("//span[@class='height']/text()")
	if heightStr != "" {
		if height := ParseHeight(heightStr); height > 0 {
			performer.Height = fmt.Sprintf("%d cm", height)
		}
	}

	// Extract measurements
	performer.Measurements = parser.QueryString("//span[@class='measurements']/text()")

	// Extract hair color
	performer.HairColor = parser.QueryString("//span[@class='hair-color']/text()")

	// Extract eye color
	performer.EyeColor = parser.QueryString("//span[@class='eye-color']/text()")

	// Extract biography
	bio := parser.QueryString("//div[@class='bio']")
	performer.Biography = CleanText(bio)

	// Extract aliases (comma-separated)
	aliasStr := parser.QueryString("//span[@class='aliases']/text()")
	if aliasStr != "" {
		performer.Aliases = splitByComma(aliasStr)
	}

	return performer, nil
}

// SearchPerformersByName searches for performers by name.
func (s *Scraper) SearchPerformersByName(ctx context.Context, name string) ([]SearchResult, error) {
	html, err := s.client.SearchPerformers(ctx, name)
	if err != nil {
		return nil, fmt.Errorf("failed to search performers: %w", err)
	}

	parser, err := NewXPathParser(html)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	var results []SearchResult

	// Extract performer search results using the official Stash scraper XPath:
	//   Root: //div[@id="performerlist"]//a
	//   Name: @label attribute
	//   URL:  @href attribute
	names := parser.QueryAttrs("//div[@id='performerlist']//a", "label")
	urls := parser.QueryAttrs("//div[@id='performerlist']//a", "href")
	images := parser.QueryAttrs("//div[@id='performerlist']//a//img", "src")
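
	// As in SearchScenesByName, results are paired by position, with
	// bounds checks for lists of different lengths.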
	for i := range names {
		result := SearchResult{
			Title: names[i],
		}
		if i < len(urls) {
			result.URL = ExtractURL(urls[i], s.client.baseURL)
		}
		if i < len(images) {
			result.Image = ExtractURL(images[i], s.client.baseURL)
		}
		results = append(results, result)
	}

	return results, nil
}

// ConvertSceneToModel converts SceneData to a Goondex model.Scene.
func (s *Scraper) ConvertSceneToModel(data *SceneData) *model.Scene {
	scene := &model.Scene{
		Title:       data.Title,
		URL:         data.URL,
		Date:        data.Date,
		Description: data.Description,
		ImageURL:    data.Image,
		Code:        data.Code,
		Director:    data.Director,
		Source:      "adultemp",
		SourceID:    ExtractID(data.URL),
	}

	// Studio, performers, and tags are not set here; they must be looked
	// up or created separately.
	return scene
}

// ConvertPerformerToModel converts PerformerData to a Goondex model.Performer.
func (s *Scraper) ConvertPerformerToModel(data *PerformerData) *model.Performer {
	performer := &model.Performer{
		Name:         data.Name,
		ImageURL:     data.Image,
		Birthday:     data.Birthdate,
		Ethnicity:    data.Ethnicity,
		Country:      data.Country,
		Measurements: data.Measurements,
		HairColor:    data.HairColor,
		EyeColor:     data.EyeColor,
		Bio:          data.Biography,
		Source:       "adultemp",
		SourceID:     ExtractID(data.URL),
	}

	// Parse height if available
	if data.Height != "" {
		if height := ParseHeight(data.Height); height > 0 {
			performer.Height = height
		}
	}

	// Join aliases
	if len(data.Aliases) > 0 {
		performer.Aliases = joinStrings(data.Aliases, ", ")
	}

	return performer
}

// splitByComma splits s on commas, trimming whitespace and dropping
// empty entries.
func splitByComma(s string) []string {
	var result []string
	for _, part := range strings.Split(s, ",") {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			result = append(result, trimmed)
		}
	}
	return result
}

// joinStrings joins the non-empty strings in strs with sep.
func joinStrings(strs []string, sep string) string {
	var nonEmpty []string
	for _, s := range strs {
		if s != "" {
			nonEmpty = append(nonEmpty, s)
		}
	}
	return strings.Join(nonEmpty, sep)
}