diff --git a/beatport_unified_scraper.py b/beatport_unified_scraper.py index 42352c53..f70cc822 100644 --- a/beatport_unified_scraper.py +++ b/beatport_unified_scraper.py @@ -1680,7 +1680,7 @@ class BeatportUnifiedScraper: return None def scrape_new_on_beatport_hero(self, limit: int = 10) -> List[Dict]: - """Scrape the 'New on Beatport' hero slideshow from homepage""" + """Scrape the 'New on Beatport' hero slideshow from homepage using data-testid standard""" print("\nšŸŽÆ Scraping 'New on Beatport' hero slideshow...") soup = self.get_page(self.base_url) @@ -1689,17 +1689,27 @@ class BeatportUnifiedScraper: tracks = [] - # Method 1: Look for the specific wrapper class you mentioned - hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ') - if hero_wrapper: - print(" āœ… Found Homepage NewOnBeatportWrapper") - tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit)) + # Method 1 (PRIMARY): Use data-testid standard like all other rebuild functions + hero_items = soup.select('[data-testid="new-on-beatport"]') + if hero_items: + print(f" āœ… Found {len(hero_items)} items using data-testid='new-on-beatport'") + for i, item in enumerate(hero_items[:limit]): + track_data = self._extract_track_from_slide(item, f"Hero Item {i+1}") + if track_data and track_data.get('url'): + tracks.append(track_data) + + # Method 2 (FALLBACK): Look for the specific wrapper class (legacy support) + if len(tracks) < 5: + hero_wrapper = soup.find('div', class_='Homepage-style__NewOnBeatportWrapper-sc-deeb4244-2 iyIchZ') + if hero_wrapper: + print(" āœ… Found Homepage NewOnBeatportWrapper (fallback)") + tracks.extend(self._extract_from_hero_wrapper(hero_wrapper, limit)) - # Method 2: Look for carousel with aria attributes you mentioned - if len(tracks) < 5: # Only try if we don't have enough tracks + # Method 3 (FALLBACK): Look for carousel with aria attributes + if len(tracks) < 5: carousel = soup.find('div', {'aria-roledescription': 'carousel', 'aria-label': 'Carousel'}) if carousel: - print(" āœ… Found carousel with aria-roledescription and aria-label") + print(" āœ… Found carousel with aria-roledescription and aria-label (fallback)") additional_tracks = self._extract_from_carousel(carousel, limit) # Merge without duplicates existing_urls = {track.get('url') for track in tracks} @@ -1707,9 +1717,9 @@ class BeatportUnifiedScraper: if track.get('url') not in existing_urls: tracks.append(track) - # Method 3: Look for individual slide items more broadly + # Method 4 (LAST RESORT): Look for individual slide items more broadly if len(tracks) < 5: - print(" šŸ” Looking for individual carousel items...") + print(" šŸ” Looking for individual carousel items (last resort)...") carousel_items = soup.find_all(['div', 'article'], class_=re.compile(r'carousel.*item|item.*carousel|slide', re.I)) print(f" Found {len(carousel_items)} potential carousel items") @@ -1890,20 +1900,31 @@ class BeatportUnifiedScraper: # Apply final cleaning to all extracted data if track_data.get('title'): - track_data['title'] = self._clean_title(track_data['title']) + track_data['title'] = self.clean_beatport_text(self._clean_title(track_data['title'])) if track_data.get('artist'): - track_data['artist'] = self._clean_artist(track_data['artist']) + track_data['artist'] = self.clean_beatport_text(self._clean_artist(track_data['artist'])) # Extract all class names for debugging classes = slide.get('class', []) if classes: track_data['element_classes'] = ' '.join(classes) - # Only return if we found at least some useful data - if track_data.get('title') or track_data.get('artist') or track_data.get('url') or track_data.get('image_url'): + # Filter out empty/invalid tracks + title = track_data.get('title', '').strip() + artist = track_data.get('artist', '').strip() + + # Skip tracks with no title/artist or generic values + if (not title or not artist or + title.lower() in ['no title', 'unknown title', 'unknown', ''] or + artist.lower() in ['no artist', 'unknown artist', 'unknown', 'various artists', '']): + print(f" āŒ {context}: Filtered out invalid track - '{title}' by '{artist}'") + return None + + # Only return if we found meaningful data + if track_data.get('url') or track_data.get('image_url'): track_data['source'] = f"New on Beatport Hero - {context}" track_data['scraped_at'] = time.time() - print(f" āœ… {context}: {track_data.get('title', 'No title')} - {track_data.get('artist', 'No artist')}") + print(f" āœ… {context}: {title} - {artist}") return track_data else: print(f" āŒ {context}: No usable data found") @@ -2143,6 +2164,20 @@ class BeatportUnifiedScraper: return ' '.join(cleaned_words) return artist + def clean_beatport_text(self, text: str) -> str: + """Clean Beatport track/artist text for proper spacing""" + if not text: + return text + + # Fix common spacing issues + text = re.sub(r'([a-z$!@#%&*])([A-Z])', r'\1 \2', text) # Add space between lowercase/symbols and uppercase + text = re.sub(r'([a-zA-Z]),([a-zA-Z])', r'\1, \2', text) # Add space after comma + text = re.sub(r'([a-zA-Z])(Mix|Remix|Extended|Version)\b', r'\1 \2', text) # Fix mix types + text = re.sub(r'\s+', ' ', text) # Collapse multiple spaces + text = text.strip() + + return text + def scrape_top_10_releases_homepage(self, limit: int = 10) -> List[Dict]: """Scrape Top 10 Releases from homepage - Extract individual tracks using URL crawling""" print("\nšŸ”Ÿ Scraping Top 10 Releases from homepage...") diff --git a/web_server.py b/web_server.py index 070a2b09..5fc7b33e 100644 --- a/web_server.py +++ b/web_server.py @@ -2141,6 +2141,12 @@ def get_beatport_hero_tracks(): url = track.get('url', '').strip() image_url = track.get('image_url', '').strip() + # Apply text cleaning for proper spacing + if title: + title = clean_beatport_text(title) + if artist: + artist = clean_beatport_text(artist) + # Validation filters is_valid = True skip_reasons = []