Goondex/internal/scraper/adultemp/xpath.go
Stu Leak 16fb407a3c v0.1.0-dev4: Add web frontend with UI component library
- Implement full web interface with Go html/template server
- Add GX component library (buttons, dialogs, tables, forms, etc.)
- Create scene/performer/studio/movie detail and listing pages
- Add Adult Empire scraper for additional metadata sources
- Implement movie support with database schema
- Add import and sync services for data management
- Include comprehensive API and frontend documentation
- Add custom color scheme and responsive layout

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 10:47:30 -05:00

186 lines
4.3 KiB
Go

package adultemp
import (
	"bytes"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)
// XPathParser runs XPath queries against one parsed HTML document.
// NewXPathParser builds it from raw HTML bytes; every Query* method evaluates
// its expression starting from doc.
type XPathParser struct {
// doc is the root node of the parsed HTML tree, set by NewXPathParser.
doc *html.Node
}
// NewXPathParser parses htmlContent into a node tree and wraps its root in an
// XPathParser. If parsing fails, the returned parser is nil and the error
// wraps the underlying cause with a "failed to parse HTML" prefix.
func NewXPathParser(htmlContent []byte) (*XPathParser, error) {
	root, parseErr := htmlquery.Parse(bytes.NewReader(htmlContent))
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", parseErr)
	}

	parser := &XPathParser{doc: root}
	return parser, nil
}
// QueryString returns the whitespace-trimmed inner text of the first node
// matched by xpath, or "" when nothing matches.
//
// NOTE(review): htmlquery.FindOne is documented to panic on an expression it
// cannot parse — confirm callers only pass fixed, known-good expressions.
func (p *XPathParser) QueryString(xpath string) string {
	if match := htmlquery.FindOne(p.doc, xpath); match != nil {
		return strings.TrimSpace(htmlquery.InnerText(match))
	}
	return ""
}
// QueryAttr returns the whitespace-trimmed value of attribute attr on the
// first node matched by xpath. It returns "" when no node matches or when the
// matched node carries no attribute with that exact key.
func (p *XPathParser) QueryAttr(xpath, attr string) string {
	match := htmlquery.FindOne(p.doc, xpath)
	if match == nil {
		return ""
	}

	for i := range match.Attr {
		if match.Attr[i].Key != attr {
			continue
		}
		// The first attribute with a matching key wins.
		return strings.TrimSpace(match.Attr[i].Val)
	}
	return ""
}
// QueryStrings returns the whitespace-trimmed inner text of every node matched
// by xpath, in match order. Nodes whose text is empty after trimming are
// skipped; the result is nil when nothing is collected.
func (p *XPathParser) QueryStrings(xpath string) []string {
	var texts []string
	for _, match := range htmlquery.Find(p.doc, xpath) {
		trimmed := strings.TrimSpace(htmlquery.InnerText(match))
		if trimmed == "" {
			continue
		}
		texts = append(texts, trimmed)
	}
	return texts
}
// QueryAttrs collects the whitespace-trimmed value of attribute attr from
// every node matched by xpath, in match order. Nodes without the attribute, or
// whose value is empty after trimming, contribute nothing; the result is nil
// when nothing is collected.
func (p *XPathParser) QueryAttrs(xpath, attr string) []string {
	var values []string
	for _, match := range htmlquery.Find(p.doc, xpath) {
		for i := range match.Attr {
			if match.Attr[i].Key != attr {
				continue
			}
			// Only the first attribute with this key is considered per node,
			// even if its value turns out to be empty.
			if trimmed := strings.TrimSpace(match.Attr[i].Val); trimmed != "" {
				values = append(values, trimmed)
			}
			break
		}
	}
	return values
}
// Helper functions for common parsing tasks
// parseDateLayouts lists the input layouts ParseDate recognizes, tried in
// order. The "2" day verb also accepts a zero-padded day, so "Jan 2, 2006"
// covers both "Jan 2, 2006" and "Jan 02, 2006".
var parseDateLayouts = []string{
	"2006-01-02",
	"Jan 2, 2006",
	"January 2, 2006",
	"Jan 2 2006",
	"January 2 2006",
}

// ParseDate normalizes a date string to YYYY-MM-DD.
//
// Surrounding whitespace is trimmed first. Input that is already in
// YYYY-MM-DD form is returned unchanged; month-name forms such as
// "Jan 02, 2006" or "January 2, 2006" are converted. Empty input yields "".
// Anything that matches none of the known layouts (including impossible
// calendar dates) is returned trimmed but otherwise untouched, so callers can
// still inspect or convert it themselves.
func ParseDate(dateStr string) string {
	dateStr = strings.TrimSpace(dateStr)
	if dateStr == "" {
		return ""
	}

	for _, layout := range parseDateLayouts {
		if parsed, err := time.Parse(layout, dateStr); err == nil {
			return parsed.Format("2006-01-02")
		}
	}

	// Unrecognized format: pass through rather than guess.
	return dateStr
}
// heightFeetInchesRe matches a feet-and-inches height such as 5'6" or 5' 6",
// capturing feet and inches. Compiled once instead of on every call.
var heightFeetInchesRe = regexp.MustCompile(`(\d+)\s*'\s*(\d+)`)

// heightDigitsRe matches the first run of decimal digits in a string.
var heightDigitsRe = regexp.MustCompile(`\d+`)

// ParseHeight converts a height string to whole centimeters.
//
// Two forms are understood:
//   - feet and inches, e.g. `5'6"` -> 168 (rounded to the nearest centimeter);
//   - a value containing "cm", e.g. "170 cm" -> 170 (the first number in the
//     string is used).
//
// The feet/inches form takes precedence when both appear. It returns 0 for
// empty input, for input in neither form, and for numbers too large to fit in
// an int.
func ParseHeight(heightStr string) int {
	heightStr = strings.TrimSpace(heightStr)
	if heightStr == "" {
		return 0
	}

	if m := heightFeetInchesRe.FindStringSubmatch(heightStr); m != nil {
		feet, feetErr := strconv.Atoi(m[1])
		inches, inchesErr := strconv.Atoi(m[2])
		if feetErr != nil || inchesErr != nil {
			// Only reachable on overflow, since the groups are all digits.
			return 0
		}
		totalInches := feet*12 + inches
		// 1 in = 2.54 cm. Integer arithmetic with +50 rounds to nearest
		// instead of truncating (66 in = 167.64 cm -> 168, not 167).
		return (totalInches*254 + 50) / 100
	}

	if strings.Contains(heightStr, "cm") {
		if digits := heightDigitsRe.FindString(heightStr); digits != "" {
			if cm, err := strconv.Atoi(digits); err == nil {
				return cm
			}
		}
	}

	return 0
}
// showMoreLessRe matches the "Show More" / "Show Less" toggle labels,
// case-insensitively and with any amount of whitespace between the words.
var showMoreLessRe = regexp.MustCompile(`(?i)show\s+(more|less)`)

// whitespaceRunRe matches one or more consecutive whitespace characters.
var whitespaceRunRe = regexp.MustCompile(`\s+`)

// CleanText strips "Show More" / "Show Less" labels from text, collapses each
// run of whitespace into a single space, and trims the ends.
//
// The label match is not anchored, so the phrase is removed wherever it
// occurs, including inside ordinary sentences.
func CleanText(text string) string {
	text = strings.TrimSpace(text)
	text = showMoreLessRe.ReplaceAllString(text, "")
	text = whitespaceRunRe.ReplaceAllString(text, " ")
	// Removing a label at either end can leave a stray space behind.
	return strings.TrimSpace(text)
}
// ExtractURL turns a possibly partial URL into an absolute one.
//
// rawURL is trimmed of surrounding whitespace and then handled by prefix:
//   - "" yields "";
//   - "http://" or "https://" is returned as-is;
//   - "//" (protocol-relative) gets "https:" prepended;
//   - "/" (root-relative path) is appended to baseURL, whose trailing slashes
//     are dropped first so the result never contains "host//path";
//   - anything else is returned unchanged.
func ExtractURL(rawURL, baseURL string) string {
	rawURL = strings.TrimSpace(rawURL)

	switch {
	case rawURL == "":
		return ""
	case strings.HasPrefix(rawURL, "http://"), strings.HasPrefix(rawURL, "https://"):
		return rawURL
	case strings.HasPrefix(rawURL, "//"):
		// Must be checked before the single-slash case below.
		return "https:" + rawURL
	case strings.HasPrefix(rawURL, "/"):
		return strings.TrimRight(baseURL, "/") + rawURL
	default:
		return rawURL
	}
}
// numericIDSegmentRe matches a path segment made only of digits that is
// enclosed by slashes on both sides. Compiled once instead of on every call.
var numericIDSegmentRe = regexp.MustCompile(`/(\d+)/`)

// ExtractID returns the first all-digit path segment of urlPath.
//
// Example: "/123456/scene-name" -> "123456".
//
// The segment must be followed by a slash, so "/123456" alone yields "".
// It returns "" when no such segment exists.
func ExtractID(urlPath string) string {
	if m := numericIDSegmentRe.FindStringSubmatch(urlPath); m != nil {
		return m[1]
	}
	return ""
}