beatport progress

pull/49/head
Broque Thomas 8 months ago
parent 5981bd772f
commit 526aeb0bda

@ -968,7 +968,7 @@ class BeatportUnifiedScraper:
return hype_releases
def scrape_top_10_releases_homepage(self, limit: int = 10) -> List[Dict]:
"""Scrape Top 10 Releases from homepage section - NEW"""
"""Scrape Top 10 Releases from homepage section - Fixed to improve title extraction"""
print("\n🔟 Scraping Top 10 Releases from homepage...")
soup = self.get_page(self.base_url)
@ -985,25 +985,85 @@ class BeatportUnifiedScraper:
rank_elem = item.select_one('[data-testid="track-number"]')
rank = rank_elem.get_text(strip=True) if rank_elem else str(i + 1)
# Extract release data
release_data = self.extract_release_data_from_card(item)
if release_data:
# Convert to track format for compatibility
track_data = {
'position': int(rank) if rank.isdigit() else i + 1,
'rank': rank,
'artist': release_data['artist'],
'title': release_data['title'],
'list_name': 'Top 10 Releases',
'url': release_data['url'],
'label': release_data.get('label', 'Unknown Label'),
'image_url': release_data.get('image_url'),
'price': release_data.get('price'),
'badges': release_data.get('badges', []),
'type': 'release',
'top_10': True
}
top_releases.append(track_data)
# Try to extract better title information
title = "Unknown Title"
# Define badges/labels to filter out when looking for titles
badge_keywords = ['EXCLUSIVE', 'HYPE', 'NEW', 'HOT', 'FEATURED', 'STAFF PICK']
# Method 1: Look for track title specifically
track_title_elem = item.select_one('[class*="track-title"], [class*="TrackTitle"], [data-testid*="track-title"]')
if track_title_elem:
potential_title = track_title_elem.get_text(strip=True)
if potential_title.upper() not in badge_keywords:
title = potential_title
if title == "Unknown Title":
# Method 2: Look for release name (fallback)
release_name_elem = item.select_one('[class*="ReleaseName"], [class*="release-name"], [class*="release-title"]')
if release_name_elem:
potential_title = release_name_elem.get_text(strip=True)
if potential_title.upper() not in badge_keywords:
title = potential_title
if title == "Unknown Title":
# Method 3: Try to get from any link text that's not an artist or label
link_elems = item.select('a')
for link in link_elems:
link_text = link.get_text(strip=True)
# Skip if it's clearly an artist link, label link, empty, or a badge
if (link_text and
'/artist/' not in link.get('href', '') and
'/label/' not in link.get('href', '') and
link_text.upper() not in badge_keywords):
title = link_text
break
# Final fallback: if we still have Unknown Title, try any text that's not a badge
if title == "Unknown Title":
all_text_elems = item.find_all(text=True)
for text_elem in all_text_elems:
text = text_elem.strip()
if (text and
len(text) > 3 and # Must be more than 3 characters
text.upper() not in badge_keywords and
not text.isdigit() and # Not just a number
'$' not in text): # Not a price
title = text
break
# Extract artists (original working method)
artist_elems = item.select('[href*="/artist/"]')
artists = []
for artist_elem in artist_elems:
artist_name = artist_elem.get_text(strip=True)
if artist_name and artist_name not in artists:
artists.append(artist_name)
# Extract other data
link_elem = item.select_one('a[href*="/release/"]')
release_url = urljoin(self.base_url, link_elem.get('href')) if link_elem else ""
label_elem = item.select_one('[href*="/label/"]')
label = label_elem.get_text(strip=True) if label_elem else "Unknown Label"
img_elem = item.select_one('img')
image_url = img_elem.get('src') if img_elem else None
# Convert to track format for compatibility
track_data = {
'position': int(rank) if rank.isdigit() else i + 1,
'rank': rank,
'artist': ', '.join(artists) if artists else "Unknown Artist",
'title': title,
'list_name': 'Top 10 Releases',
'url': release_url,
'label': label,
'image_url': image_url,
'type': 'release',
'top_10': True
}
top_releases.append(track_data)
print(f"✅ Extracted {len(top_releases)} releases from Top 10 Releases")
return top_releases

@ -453,7 +453,7 @@
<div class="homepage-releases-section">
<h3 class="section-title">🎵 Releases</h3>
<div class="genre-chart-types-grid">
<div class="genre-chart-type-card" data-chart-type="releases-top-10" data-chart-endpoint="/api/beatport/homepage/releases-top-10">
<div class="genre-chart-type-card" data-chart-type="releases-top-10" data-chart-endpoint="/api/beatport/homepage/top-10-releases">
<div class="chart-type-icon">🆕</div>
<div class="chart-type-info">
<h3>Top 10 Releases</h3>
@ -469,7 +469,7 @@
<span class="track-count">100 releases</span>
</div>
</div>
<div class="genre-chart-type-card" data-chart-type="latest-releases" data-chart-endpoint="/api/beatport/homepage/latest-releases">
<div class="genre-chart-type-card" data-chart-type="latest-releases" data-chart-endpoint="/api/beatport/homepage/new-releases">
<div class="chart-type-icon">🕒</div>
<div class="chart-type-info">
<h3>Latest Releases</h3>

@ -10409,7 +10409,7 @@ async function handleHomepageChartTypeClick(chartType, chartEndpoint, chartName)
limit: 100
},
'releases-top-10': {
endpoint: `/api/beatport/homepage/releases-top-10`, // Placeholder for future
endpoint: `/api/beatport/homepage/top-10-releases`, // Working route
name: `Top 10 Releases`,
limit: 10
},
@ -10419,7 +10419,7 @@ async function handleHomepageChartTypeClick(chartType, chartEndpoint, chartName)
limit: 100
},
'latest-releases': {
endpoint: `/api/beatport/homepage/latest-releases`, // Placeholder for future
endpoint: `/api/beatport/homepage/new-releases`, // Use new-releases as fallback for now
name: `Latest Releases`,
limit: 50
},

Loading…
Cancel
Save