- Implement full web interface with Go html/template server - Add GX component library (buttons, dialogs, tables, forms, etc.) - Create scene/performer/studio/movie detail and listing pages - Add Adult Empire scraper for additional metadata sources - Implement movie support with database schema - Add import and sync services for data management - Include comprehensive API and frontend documentation - Add custom color scheme and responsive layout 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
186 lines
4.3 KiB
Go
186 lines
4.3 KiB
Go
package adultemp
|
|
|
|
import (
	"bytes"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)
|
|
|
|
// XPathParser handles XPath parsing of Adult Empire pages
|
|
type XPathParser struct {
|
|
doc *html.Node
|
|
}
|
|
|
|
// NewXPathParser creates a new XPath parser from HTML bytes
|
|
func NewXPathParser(htmlContent []byte) (*XPathParser, error) {
|
|
doc, err := htmlquery.Parse(bytes.NewReader(htmlContent))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
|
}
|
|
|
|
return &XPathParser{doc: doc}, nil
|
|
}
|
|
|
|
// QueryString extracts a single string value using XPath
|
|
func (p *XPathParser) QueryString(xpath string) string {
|
|
node := htmlquery.FindOne(p.doc, xpath)
|
|
if node == nil {
|
|
return ""
|
|
}
|
|
return strings.TrimSpace(htmlquery.InnerText(node))
|
|
}
|
|
|
|
// QueryAttr extracts an attribute value using XPath
|
|
func (p *XPathParser) QueryAttr(xpath, attr string) string {
|
|
node := htmlquery.FindOne(p.doc, xpath)
|
|
if node == nil {
|
|
return ""
|
|
}
|
|
for _, a := range node.Attr {
|
|
if a.Key == attr {
|
|
return strings.TrimSpace(a.Val)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// QueryStrings extracts multiple string values using XPath
|
|
func (p *XPathParser) QueryStrings(xpath string) []string {
|
|
nodes := htmlquery.Find(p.doc, xpath)
|
|
var results []string
|
|
for _, node := range nodes {
|
|
text := strings.TrimSpace(htmlquery.InnerText(node))
|
|
if text != "" {
|
|
results = append(results, text)
|
|
}
|
|
}
|
|
return results
|
|
}
|
|
|
|
// QueryAttrs extracts multiple attribute values using XPath
|
|
func (p *XPathParser) QueryAttrs(xpath, attr string) []string {
|
|
nodes := htmlquery.Find(p.doc, xpath)
|
|
var results []string
|
|
for _, node := range nodes {
|
|
for _, a := range node.Attr {
|
|
if a.Key == attr {
|
|
val := strings.TrimSpace(a.Val)
|
|
if val != "" {
|
|
results = append(results, val)
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return results
|
|
}
|
|
|
|
// Helper functions for common parsing tasks
|
|
|
|
// parseDateLayouts lists the input layouts ParseDate recognizes, in the order
// they are tried. A layout day of "2" also accepts a zero-padded day.
var parseDateLayouts = []string{
	"2006-01-02",      // already normalized
	"Jan 2, 2006",     // abbreviated month name, e.g. "Jan 02, 2006"
	"January 2, 2006", // full month name
}

// ParseDate normalizes a date string to YYYY-MM-DD.
//
// Surrounding whitespace is ignored. An empty input yields "". Inputs already
// in YYYY-MM-DD form are returned unchanged, and inputs such as
// "Jan 02, 2006" or "January 2, 2006" are converted to "2006-01-02". Any
// input that matches none of the known layouts is returned trimmed but
// otherwise untouched, so callers can still apply their own handling.
func ParseDate(dateStr string) string {
	dateStr = strings.TrimSpace(dateStr)
	if dateStr == "" {
		return ""
	}

	for _, layout := range parseDateLayouts {
		if t, err := time.Parse(layout, dateStr); err == nil {
			return t.Format("2006-01-02")
		}
	}

	// Unknown format: pass it through rather than guessing.
	return dateStr
}
|
|
|
|
var (
	// heightFeetInchesRe matches a feet value followed by an apostrophe and
	// optional inches, tolerating whitespace: 5'6", 5' 6" or 5'.
	heightFeetInchesRe = regexp.MustCompile(`(\d+)\s*'\s*(\d*)`)
	// heightFirstNumberRe matches the first run of digits in a string.
	heightFirstNumberRe = regexp.MustCompile(`\d+`)
)

// ParseHeight converts a height string to whole centimeters.
//
// Feet-and-inches input (for example 5'6") is converted at 2.54 cm per inch
// and rounded to the nearest centimeter, so 5'6" yields 168. If no
// feet-and-inches pattern is found but the string contains "cm", the first
// number in the string is returned as-is. An empty, unrecognized or
// unparsable input yields 0.
func ParseHeight(heightStr string) int {
	heightStr = strings.TrimSpace(heightStr)
	if heightStr == "" {
		return 0
	}

	if m := heightFeetInchesRe.FindStringSubmatch(heightStr); m != nil {
		feet, err := strconv.Atoi(m[1])
		if err != nil {
			return 0
		}
		inches := 0
		if m[2] != "" {
			if inches, err = strconv.Atoi(m[2]); err != nil {
				return 0
			}
		}
		totalInches := feet*12 + inches
		// Integer arithmetic with +50 rounds to the nearest centimeter;
		// truncating 167.64 to 167 contradicted the documented example.
		return (totalInches*254 + 50) / 100
	}

	if strings.Contains(heightStr, "cm") {
		if digits := heightFirstNumberRe.FindString(heightStr); digits != "" {
			if cm, err := strconv.Atoi(digits); err == nil {
				return cm
			}
		}
	}

	return 0
}
|
|
|
|
var (
	// cleanTextShowToggleRe matches "Show More" / "Show Less" toggle labels,
	// case-insensitively and with any amount of whitespace between the words.
	cleanTextShowToggleRe = regexp.MustCompile(`(?i)show\s+(more|less)`)
	// cleanTextWhitespaceRe matches any run of whitespace.
	cleanTextWhitespaceRe = regexp.MustCompile(`\s+`)
)

// CleanText strips "Show More" / "Show Less" toggle labels from text,
// collapses every run of whitespace to a single space and trims the result.
//
// The patterns are compiled once at package scope instead of on every call.
func CleanText(text string) string {
	text = strings.TrimSpace(text)

	// Drop the toggle labels first; the gaps they leave are collapsed below.
	text = cleanTextShowToggleRe.ReplaceAllString(text, "")
	text = cleanTextWhitespaceRe.ReplaceAllString(text, " ")

	return strings.TrimSpace(text)
}
|
|
|
|
// ExtractURL turns rawURL into an absolute URL where it can.
//
// Surrounding whitespace is trimmed first. The result is:
//   - "" for an empty rawURL;
//   - rawURL itself when it already starts with "http://" or "https://";
//   - "https:" + rawURL when it is protocol-relative (starts with "//");
//   - baseURL joined with rawURL when it is root-relative (starts with "/"),
//     with trailing slashes removed from baseURL so the join never produces
//     a double slash;
//   - rawURL unchanged in every other case.
func ExtractURL(rawURL, baseURL string) string {
	rawURL = strings.TrimSpace(rawURL)

	switch {
	case rawURL == "":
		return ""
	case strings.HasPrefix(rawURL, "http://"), strings.HasPrefix(rawURL, "https://"):
		return rawURL
	case strings.HasPrefix(rawURL, "//"):
		// Must be checked before the single-slash case below.
		return "https:" + rawURL
	case strings.HasPrefix(rawURL, "/"):
		return strings.TrimRight(baseURL, "/") + rawURL
	default:
		return rawURL
	}
}
|
|
|
|
// extractIDPathSegmentRe matches a path segment made only of digits, ended
// either by another "/" or by the end of the string.
var extractIDPathSegmentRe = regexp.MustCompile(`/(\d+)(?:/|$)`)

// ExtractID returns the first all-digit path segment of urlPath, or "" when
// there is none.
//
// Examples:
//
//	"/123456/scene-name" -> "123456"
//	"/performer/42"      -> "42"
//
// The pattern is compiled once at package scope instead of on every call.
func ExtractID(urlPath string) string {
	if m := extractIDPathSegmentRe.FindStringSubmatch(urlPath); m != nil {
		return m[1]
	}
	return ""
}
|