fix hero slider

pull/49/head
Broque Thomas 8 months ago
parent bdb087f8d5
commit 0d54e4a62a

@ -1680,7 +1680,7 @@ class BeatportUnifiedScraper:
return None
def scrape_new_on_beatport_hero(self, limit: int = 10) -> List[Dict]:
"""Scrape the 'New on Beatport' hero slideshow from homepage"""
"""Scrape the 'New on Beatport' hero slideshow from homepage using data-testid standard"""
print("\n🎯 Scraping 'New on Beatport' hero slideshow...")
soup = self.get_page(self.base_url)
@ -1689,17 +1689,27 @@ class BeatportUnifiedScraper:
tracks = []
# Method 1: Look for the specific wrapper class you mentioned
hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ')
if hero_wrapper:
print(" ✅ Found Homepage NewOnBeatportWrapper")
tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit))
# Method 1 (PRIMARY): Use data-testid standard like all other rebuild functions
hero_items = soup.select('[data-testid="new-on-beatport"]')
if hero_items:
print(f" ✅ Found {len(hero_items)} items using data-testid='new-on-beatport'")
for i, item in enumerate(hero_items[:limit]):
track_data = self._extract_track_from_slide(item, f"Hero Item {i+1}")
if track_data and track_data.get('url'):
tracks.append(track_data)
# Method 2 (FALLBACK): Look for the specific wrapper class (legacy support)
if len(tracks) < 5:
hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ')
if hero_wrapper:
print(" ✅ Found Homepage NewOnBeatportWrapper (fallback)")
tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit))
# Method 2: Look for carousel with aria attributes you mentioned
if len(tracks) < 5: # Only try if we don't have enough tracks
# Method 3 (FALLBACK): Look for carousel with aria attributes
if len(tracks) < 5:
carousel = soup.find('div', {'aria-roledescription': 'carousel', 'aria-label': 'Carousel'})
if carousel:
print(" ✅ Found carousel with aria-roledescription and aria-label")
print(" ✅ Found carousel with aria-roledescription and aria-label (fallback)")
additional_tracks = self._extract_from_carousel(carousel, limit)
# Merge without duplicates
existing_urls = {track.get('url') for track in tracks}
@ -1707,9 +1717,9 @@ class BeatportUnifiedScraper:
if track.get('url') not in existing_urls:
tracks.append(track)
# Method 3: Look for individual slide items more broadly
# Method 4 (LAST RESORT): Look for individual slide items more broadly
if len(tracks) < 5:
print(" 🔍 Looking for individual carousel items...")
print(" 🔍 Looking for individual carousel items (last resort)...")
carousel_items = soup.find_all(['div', 'article'], class_=re.compile(r'carousel.*item|item.*carousel|slide', re.I))
print(f" Found {len(carousel_items)} potential carousel items")
@ -1890,20 +1900,31 @@ class BeatportUnifiedScraper:
# Apply final cleaning to all extracted data
if track_data.get('title'):
track_data['title'] = self._clean_title(track_data['title'])
track_data['title'] = self.clean_beatport_text(self._clean_title(track_data['title']))
if track_data.get('artist'):
track_data['artist'] = self._clean_artist(track_data['artist'])
track_data['artist'] = self.clean_beatport_text(self._clean_artist(track_data['artist']))
# Extract all class names for debugging
classes = slide.get('class', [])
if classes:
track_data['element_classes'] = ' '.join(classes)
# Only return if we found at least some useful data
if track_data.get('title') or track_data.get('artist') or track_data.get('url') or track_data.get('image_url'):
# Filter out empty/invalid tracks
title = track_data.get('title', '').strip()
artist = track_data.get('artist', '').strip()
# Skip tracks with no title/artist or generic values
if (not title or not artist or
title.lower() in ['no title', 'unknown title', 'unknown', ''] or
artist.lower() in ['no artist', 'unknown artist', 'unknown', 'various artists', '']):
print(f"{context}: Filtered out invalid track - '{title}' by '{artist}'")
return None
# Only return if we found meaningful data
if track_data.get('url') or track_data.get('image_url'):
track_data['source'] = f"New on Beatport Hero - {context}"
track_data['scraped_at'] = time.time()
print(f"{context}: {track_data.get('title', 'No title')} - {track_data.get('artist', 'No artist')}")
print(f"{context}: {title} - {artist}")
return track_data
else:
print(f"{context}: No usable data found")
@ -2143,6 +2164,20 @@ class BeatportUnifiedScraper:
return ' '.join(cleaned_words)
return artist
def clean_beatport_text(self, text: str) -> str:
    """Repair missing spaces in scraped Beatport track/artist text.

    The rules are applied in order:

    1. Insert a space at a lowercase-to-uppercase boundary, optionally
       with a run of the symbols ``$!@#%&*`` between the two letters
       ("HelloWorld" -> "Hello World", "It!Extended" -> "It! Extended").
    2. Insert a space after a comma that sits directly between two
       ASCII letters ("A,B" -> "A, B").
    3. Split a trailing "Mix", "Remix", "Extended" or "Version" from a
       letter glued in front of it ("DJMix" -> "DJ Mix").
    4. Collapse every whitespace run to a single space and strip the ends.

    Args:
        text: Raw text to clean. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned text, or the original falsy value.
    """
    if not text:
        return text

    # A symbol only counts as a word boundary when the run it belongs to
    # follows a lowercase letter. Splitting after *any* symbol mangled
    # stylised uppercase names such as "R&B", "D&B", "A$AP" and "P!NK".
    text = re.sub(r'([a-z][$!@#%&*]*)([A-Z])', r'\1 \2', text)

    # Add the missing space after a comma between two letters.
    text = re.sub(r'([a-zA-Z]),([a-zA-Z])', r'\1, \2', text)

    # Separate mix-type keywords that are still glued to the previous
    # letter (rule 1 cannot catch an uppercase letter before the keyword).
    text = re.sub(r'([a-zA-Z])(Mix|Remix|Extended|Version)\b', r'\1 \2', text)

    # Normalise whitespace last so earlier insertions never double up.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def scrape_top_10_releases_homepage(self, limit: int = 10) -> List[Dict]:
"""Scrape Top 10 Releases from homepage - Extract individual tracks using URL crawling"""
print("\n🔟 Scraping Top 10 Releases from homepage...")

@ -2141,6 +2141,12 @@ def get_beatport_hero_tracks():
url = track.get('url', '').strip()
image_url = track.get('image_url', '').strip()
# Apply text cleaning for proper spacing
if title:
title = clean_beatport_text(title)
if artist:
artist = clean_beatport_text(artist)
# Validation filters
is_valid = True
skip_reasons = []

Loading…
Cancel
Save