fix hero slider

8 months ago · 0d54e4a62a
parent bdb087f8d5
commit 0d54e4a62a
2 changed files with 57 additions and 16 deletions
--- a/beatport_unified_scraper.py
+++ b/beatport_unified_scraper.py
@ -1680,7 +1680,7 @@ class BeatportUnifiedScraper:
            return None

    def scrape_new_on_beatport_hero(self, limit: int = 10) -> List[Dict]:
-        """Scrape the 'New on Beatport' hero slideshow from homepage"""
+        """Scrape the 'New on Beatport' hero slideshow from homepage using data-testid standard"""
        print("\n🎯 Scraping 'New on Beatport' hero slideshow...")

        soup = self.get_page(self.base_url)
@ -1689,17 +1689,27 @@ class BeatportUnifiedScraper:

        tracks = []

-        # Method 1: Look for the specific wrapper class you mentioned
-        hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ')
-        if hero_wrapper:
-            print("   ✅ Found Homepage NewOnBeatportWrapper")
-            tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit))
+        # Method 1 (PRIMARY): Use data-testid standard like all other rebuild functions
+        hero_items = soup.select('[data-testid="new-on-beatport"]')
+        if hero_items:
+            print(f"   ✅ Found {len(hero_items)} items using data-testid='new-on-beatport'")
+            for i, item in enumerate(hero_items[:limit]):
+                track_data = self._extract_track_from_slide(item, f"Hero Item {i+1}")
+                if track_data and track_data.get('url'):
+                    tracks.append(track_data)
+
+        # Method 2 (FALLBACK): Look for the specific wrapper class (legacy support)
+        if len(tracks) < 5:
+            hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ')
+            if hero_wrapper:
+                print("   ✅ Found Homepage NewOnBeatportWrapper (fallback)")
+                tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit))

-        # Method 2: Look for carousel with aria attributes you mentioned
-        if len(tracks) < 5:  # Only try if we don't have enough tracks
+        # Method 3 (FALLBACK): Look for carousel with aria attributes
+        if len(tracks) < 5:
            carousel = soup.find('div', {'aria-roledescription': 'carousel', 'aria-label': 'Carousel'})
            if carousel:
-                print("   ✅ Found carousel with aria-roledescription and aria-label")
+                print("   ✅ Found carousel with aria-roledescription and aria-label (fallback)")
                additional_tracks = self._extract_from_carousel(carousel, limit)
                # Merge without duplicates
                existing_urls = {track.get('url') for track in tracks}
@ -1707,9 +1717,9 @@ class BeatportUnifiedScraper:
                    if track.get('url') not in existing_urls:
                        tracks.append(track)

-        # Method 3: Look for individual slide items more broadly
+        # Method 4 (LAST RESORT): Look for individual slide items more broadly
        if len(tracks) < 5:
-            print("   🔍 Looking for individual carousel items...")
+            print("   🔍 Looking for individual carousel items (last resort)...")
            carousel_items = soup.find_all(['div', 'article'], class_=re.compile(r'carousel.*item|item.*carousel|slide', re.I))
            print(f"   Found {len(carousel_items)} potential carousel items")

@ -1890,20 +1900,31 @@ class BeatportUnifiedScraper:

            # Apply final cleaning to all extracted data
            if track_data.get('title'):
-                track_data['title'] = self._clean_title(track_data['title'])
+                track_data['title'] = self.clean_beatport_text(self._clean_title(track_data['title']))
            if track_data.get('artist'):
-                track_data['artist'] = self._clean_artist(track_data['artist'])
+                track_data['artist'] = self.clean_beatport_text(self._clean_artist(track_data['artist']))

            # Extract all class names for debugging
            classes = slide.get('class', [])
            if classes:
                track_data['element_classes'] = ' '.join(classes)

-            # Only return if we found at least some useful data
-            if track_data.get('title') or track_data.get('artist') or track_data.get('url') or track_data.get('image_url'):
+            # Filter out empty/invalid tracks
+            title = track_data.get('title', '').strip()
+            artist = track_data.get('artist', '').strip()
+
+            # Skip tracks with no title/artist or generic values
+            if (not title or not artist or
+                title.lower() in ['no title', 'unknown title', 'unknown', ''] or
+                artist.lower() in ['no artist', 'unknown artist', 'unknown', 'various artists', '']):
+                print(f"   ❌ {context}: Filtered out invalid track - '{title}' by '{artist}'")
+                return None
+
+            # Only return if we found meaningful data
+            if track_data.get('url') or track_data.get('image_url'):
                track_data['source'] = f"New on Beatport Hero - {context}"
                track_data['scraped_at'] = time.time()
-                print(f"   ✅ {context}: {track_data.get('title', 'No title')} - {track_data.get('artist', 'No artist')}")
+                print(f"   ✅ {context}: {title} - {artist}")
                return track_data
            else:
                print(f"   ❌ {context}: No usable data found")
@ -2143,6 +2164,20 @@ class BeatportUnifiedScraper:
            return ' '.join(cleaned_words)
        return artist

+    def clean_beatport_text(self, text: str) -> str:
+        """Clean Beatport track/artist text for proper spacing"""
+        if not text:
+            return text
+
+        # Fix common spacing issues
+        text = re.sub(r'([a-z$!@#%&*])([A-Z])', r'\1 \2', text)  # Add space between lowercase/symbols and uppercase
+        text = re.sub(r'([a-zA-Z]),([a-zA-Z])', r'\1, \2', text)  # Add space after comma
+        text = re.sub(r'([a-zA-Z])(Mix|Remix|Extended|Version)\b', r'\1 \2', text)  # Fix mix types
+        text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
+        text = text.strip()
+
+        return text
+
    def scrape_top_10_releases_homepage(self, limit: int = 10) -> List[Dict]:
        """Scrape Top 10 Releases from homepage - Extract individual tracks using URL crawling"""
        print("\n🔟 Scraping Top 10 Releases from homepage...")
--- a/web_server.py
+++ b/web_server.py
@ -2141,6 +2141,12 @@ def get_beatport_hero_tracks():
            url = track.get('url', '').strip()
            image_url = track.get('image_url', '').strip()

+            # Apply text cleaning for proper spacing
+            if title:
+                title = clean_beatport_text(title)
+            if artist:
+                artist = clean_beatport_text(artist)
+
            # Validation filters
            is_valid = True
            skip_reasons = []