beatport progress

pull/49/head
Broque Thomas 8 months ago
parent 27b1e57d97
commit dc8e3c1033

@@ -445,7 +445,7 @@ class BeatportUnifiedScraper:
try:
# Get track title
raw_title = link.get_text(strip=True)
raw_title = link.get_text(separator=' ', strip=True)
if not raw_title:
continue
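The recurring get_text(separator=' ', strip=True) change guards against titles split across nested tags: without a separator, BeautifulSoup joins the text nodes with nothing between them. A minimal standalone sketch of the difference (the markup is invented, not Beatport's):

from bs4 import BeautifulSoup

html = '<a href="/track/1/x"><span>Midnight</span><span>(Extended Mix)</span></a>'
link = BeautifulSoup(html, 'html.parser').find('a')
print(link.get_text(strip=True))                 # 'Midnight(Extended Mix)' - runs together
print(link.get_text(separator=' ', strip=True))  # 'Midnight (Extended Mix)'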
@@ -978,32 +978,78 @@ class BeatportUnifiedScraper:
return tracks
def scrape_genre_hype_picks(self, genre: Dict, limit: int = 50) -> List[Dict]:
"""Scrape hype picks for a specific genre"""
"""Scrape hype picks for a specific genre - FIXED VERSION"""
tracks = []
# Method 1: Try dedicated hype picks URL (similar to hype-100)
hype_picks_urls = [
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}/hype-picks",
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}/picks",
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}" # Main page
# Try multiple hype-related URLs
hype_urls_to_try = [
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}/hype-100",
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}/hype",
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}/hype-10",
f"{self.base_url}/genre/{genre['slug']}/{genre['id']}" # Main page as fallback
]
for hype_url in hype_picks_urls:
print(f" ⚡ Trying hype picks URL: {hype_url}")
for hype_url in hype_urls_to_try:
print(f" 🔥 Trying hype URL: {hype_url}")
soup = self.get_page(hype_url)
if soup:
# Extract hype picks from carousel and individual HYPE labeled tracks
tracks = self.extract_comprehensive_hype_picks(soup, f"{genre['name']} Hype Picks", limit)
if tracks and len(tracks) >= min(limit, 10):
print(f" ✅ Successfully extracted {len(tracks)} hype picks from {hype_url}")
# First try direct track extraction
tracks = self.extract_tracks_from_page(soup, f"{genre['name']} Hype Picks", limit)
if len(tracks) >= 10: # Good result
print(f" ✅ Found {len(tracks)} hype tracks from {hype_url}")
break
elif tracks:
print(f" ⚠️ Only found {len(tracks)} hype picks at {hype_url}, trying next URL...")
elif len(tracks) > 0:
print(f" ⚠️ Only found {len(tracks)} hype tracks, trying next URL...")
else:
print(f" ❌ No hype picks found at {hype_url}")
print(f" ❌ No hype tracks found at {hype_url}")
# If main page, try to find hype section
if hype_url.endswith(genre['id']):
print(f" 🔍 Searching for hype section on main genre page...")
hype_section_tracks = self.find_hype_section_on_genre_page(soup, genre, limit)
if hype_section_tracks:
tracks = hype_section_tracks
print(f" ✅ Found {len(tracks)} tracks in hype section")
break
return tracks[:limit]
def find_hype_section_on_genre_page(self, soup, genre: Dict, limit: int) -> List[Dict]:
"""Find and extract tracks from hype section on main genre page"""
tracks = []
# Look for headings containing "hype"
hype_headings = soup.find_all(['h1', 'h2', 'h3', 'h4'],
string=re.compile(r'hype', re.I))
for heading in hype_headings:
print(f" 📝 Found hype heading: {heading.get_text(strip=True)}")
# Get the section after this heading
section_container = heading.find_parent()
if section_container:
# Look for tracks in the next sibling or current container
content_areas = [
section_container.find_next_sibling(),
section_container
]
for content_area in content_areas:
if content_area:
section_tracks = self.extract_tracks_from_page(
content_area, f"{genre['name']} Hype Picks", limit
)
if section_tracks:
tracks.extend(section_tracks)
if len(tracks) >= limit:
break
if tracks:
break
return tracks
def extract_comprehensive_hype_picks(self, soup: BeautifulSoup, list_name: str, limit: int) -> List[Dict]:
"""Extract hype picks using multiple methods to get full 50 tracks"""
tracks = []
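The find_hype_section_on_genre_page helper above leans on two bs4 idioms: matching headings by regex and walking from the heading's parent to the next sibling container. A self-contained sketch using invented markup (the real Beatport DOM will differ):

import re
from bs4 import BeautifulSoup

html = ('<div><h2>Hype Picks</h2></div>'
        '<div><a href="/track/42/some-track">Some Track</a></div>')
soup = BeautifulSoup(html, 'html.parser')
heading = soup.find(['h1', 'h2', 'h3', 'h4'], string=re.compile(r'hype', re.I))
container = heading.find_parent()        # block that wraps the heading
sibling = container.find_next_sibling()  # content block right after it
for link in sibling.find_all('a', href=re.compile(r'/track/')):
    print(link.get_text(separator=' ', strip=True))  # -> 'Some Track'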
@@ -1061,7 +1107,7 @@ class BeatportUnifiedScraper:
if not title_link:
continue
track_title = title_link.get_text(strip=True)
track_title = title_link.get_text(separator=' ', strip=True)
track_url = urljoin(self.base_url, title_link['href'])
# Use release artist as fallback
@@ -1160,7 +1206,7 @@ class BeatportUnifiedScraper:
# Extract track info from the first track link in this container
for link in track_links[:1]: # Just take the first track from each HYPE container
try:
raw_title = link.get_text(strip=True)
raw_title = link.get_text(separator=' ', strip=True)
if not raw_title or len(raw_title) < 2:
continue
@@ -1308,7 +1354,7 @@ class BeatportUnifiedScraper:
if not title_elem:
title_elem = title_link
track_title = title_elem.get_text(strip=True)
track_title = title_elem.get_text(separator=' ', strip=True)
# Extract artists
artist_container = item.find(class_=re.compile(r'ArtistNames'))
@@ -1365,7 +1411,7 @@ class BeatportUnifiedScraper:
if not title_elem:
title_elem = title_link
track_title = title_elem.get_text(strip=True)
track_title = title_elem.get_text(separator=' ', strip=True)
# Extract artists
artist_container = row.find(class_=re.compile(r'ArtistNames'))
@@ -1397,7 +1443,7 @@ class BeatportUnifiedScraper:
return tracks
def scrape_genre_staff_picks(self, genre: Dict, limit: int = 50) -> List[Dict]:
"""Scrape staff picks for a specific genre"""
"""Scrape staff picks for a specific genre - FIXED VERSION"""
genre_url = f"{self.base_url}/genre/{genre['slug']}/{genre['id']}"
soup = self.get_page(genre_url)
@@ -1406,14 +1452,13 @@ class BeatportUnifiedScraper:
tracks = []
# Look for staff picks, editorial, or featured sections on genre page
staff_sections = [
'staff pick', 'editorial', 'featured', 'editor', 'hype pick',
'weekend pick', 'best new', 'exclusives'
# Method 1: Look for editorial/staff pick sections directly
editorial_sections = [
'staff pick', 'editorial', 'featured', 'editor pick',
'beatport picks', 'weekend pick', 'best new', 'exclusives'
]
for section_name in staff_sections:
# Find section headings that match staff pick patterns
for section_name in editorial_sections:
section_heading = soup.find(['h1', 'h2', 'h3', 'h4'],
string=re.compile(rf'{section_name}', re.I))
@@ -1428,29 +1473,41 @@ class BeatportUnifiedScraper:
)
if section_tracks:
tracks.extend(section_tracks)
break # Found staff picks, no need to continue
break # Found staff picks, stop looking
# If no specific staff picks section found, try to find any editorial content
# Method 2: If no direct sections found, look for editorial chart collections
if not tracks:
print(f" 🔍 No specific staff picks section found, looking for editorial content...")
# Look for DJ charts or featured charts on the genre page
print(f" 🔍 No direct staff picks section found, checking editorial charts...")
chart_links = soup.find_all('a', href=re.compile(r'/chart/'))
editorial_charts = []
for chart_link in chart_links[:10]: # Limit to first 10 charts
chart_name = chart_link.get_text(strip=True)
if chart_name and len(chart_name) > 3:
track_info = {
'position': len(tracks) + 1,
'artist': 'Various Artists',
'title': chart_name,
'list_name': f"{genre['name']} Staff Picks",
'url': urljoin(self.base_url, chart_link.get('href', '')),
'chart_type': 'staff_pick'
}
tracks.append(track_info)
if len(tracks) >= limit:
break
chart_href = chart_link.get('href', '')
return tracks
# Filter for editorial-style chart names
if any(keyword in chart_name.lower() for keyword in
['best new', 'weekend pick', 'editor', 'staff', 'beatport picks', 'exclusive']):
editorial_charts.append((chart_name, chart_href))
print(f" 📊 Found {len(editorial_charts)} editorial charts")
# Extract tracks from editorial charts
for chart_name, chart_href in editorial_charts[:3]: # Limit to 3 charts
if len(tracks) >= limit:
break
print(f" 📊 Processing editorial chart: {chart_name}")
chart_url = urljoin(self.base_url, chart_href)
remaining_limit = limit - len(tracks)
chart_tracks = self.extract_tracks_from_chart(chart_url, chart_name, remaining_limit)
if chart_tracks:
tracks.extend(chart_tracks)
print(f" ✅ Added {len(chart_tracks)} tracks from {chart_name}")
return tracks[:limit]
def scrape_genre_latest_releases(self, genre: Dict, limit: int = 50) -> List[Dict]:
"""Scrape latest releases for a specific genre"""
@@ -1489,59 +1546,43 @@ class BeatportUnifiedScraper:
return tracks
def scrape_genre_new_charts(self, genre: Dict, limit: int = 100) -> List[Dict]:
"""Scrape tracks from new charts (DJ/artist curated) for a specific genre"""
"""Scrape NEW CHARTS COLLECTION - Returns list of charts, not individual tracks"""
genre_url = f"{self.base_url}/genre/{genre['slug']}/{genre['id']}"
soup = self.get_page(genre_url)
if not soup:
return []
tracks = []
# Look for DJ charts, artist charts, or curated content on genre page
charts = []
chart_links = soup.find_all('a', href=re.compile(r'/chart/'))
# Extract tracks from each chart until we reach the limit
charts_processed = 0
max_charts = 10 # Limit number of charts to process for performance
for chart_link in chart_links[:max_charts]:
if len(tracks) >= limit:
break
print(f" 🔍 Found {len(chart_links)} chart links on genre page")
for chart_link in chart_links[:limit]:
chart_name = chart_link.get_text(strip=True)
chart_href = chart_link.get('href', '')
if chart_name and chart_href and len(chart_name) > 3:
charts_processed += 1
print(f" 📊 Processing chart {charts_processed}: {chart_name}")
# Get tracks from this chart
chart_url = urljoin(self.base_url, chart_href)
chart_tracks = self.extract_tracks_from_chart(chart_url, chart_name, min(20, limit - len(tracks)))
if chart_tracks:
tracks.extend(chart_tracks)
print(f" ✅ Added {len(chart_tracks)} tracks from {chart_name}")
else:
print(f" ❌ No tracks found in {chart_name}")
# If not enough tracks from charts, get additional content from the genre page
if len(tracks) < limit:
print(f" 🔍 Getting additional tracks from genre page to reach {limit} total...")
additional_tracks = self.extract_tracks_from_page(soup, f"New {genre['name']} Charts", limit - len(tracks))
# Create chart metadata entry (not individual tracks)
chart_info = {
'position': len(charts) + 1,
'artist': 'Various Artists', # Charts are compilations
'title': chart_name,
'list_name': f"{genre['name']} New Charts",
'url': urljoin(self.base_url, chart_href),
'chart_name': chart_name,
'chart_type': 'new_chart',
'genre': genre['name']
}
charts.append(chart_info)
# Avoid duplicates
for track in additional_tracks:
if not any(existing['url'] == track['url'] for existing in tracks):
tracks.append(track)
if len(tracks) >= limit:
break
print(f" 📊 Chart {len(charts)}: {chart_name}")
return tracks[:limit]
print(f" ✅ Found {len(charts)} charts in New Charts Collection")
return charts[:limit]
def extract_tracks_from_chart(self, chart_url: str, chart_name: str, limit: int) -> List[Dict]:
"""Extract individual tracks from a chart page"""
"""Extract individual tracks from a chart page - OPTIMIZED FOR CHART PAGES"""
tracks = []
try:
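With this change scrape_genre_new_charts returns chart metadata dicts rather than flattened tracks, and track extraction becomes a second, per-chart step. A rough sketch of consuming that shape (the chart entry is fabricated; phase two is the extract_tracks_from_chart call below):

charts = [
    {'position': 1, 'title': 'Peak Time Weapons', 'chart_type': 'new_chart',
     'chart_name': 'Peak Time Weapons', 'genre': 'Techno',
     'url': 'https://www.beatport.com/chart/peak-time-weapons/12345'},  # made-up URL
]
for chart in charts:
    print(f"{chart['position']}. {chart['title']} ({chart['genre']})")
    # Phase two would be a network call, sketched only:
    # tracks = scraper.extract_tracks_from_chart(chart['url'], chart['chart_name'], 100)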
@@ -1549,48 +1590,207 @@ class BeatportUnifiedScraper:
if not soup:
return tracks
# Look for track items in the chart
track_items = soup.find_all(class_=re.compile(r'Track.*Item|Lists.*Item|Table.*Row'))
print(f" 🔍 Extracting tracks from chart page: {chart_url}")
print(f" 📋 Chart name: {chart_name}")
for item in track_items[:limit]:
try:
# Skip header rows
if item.get('role') == 'columnheader':
continue
# DEBUG: Check page title to confirm we're on the right page
page_title = soup.find('title')
if page_title:
print(f" 📄 Page title: {page_title.get_text(strip=True)}")
# Extract track title
title_link = item.find('a', href=re.compile(r'/track/'))
if not title_link:
continue
# DEBUG: Look for the chart title on the page
chart_title_elem = soup.find(['h1', 'h2'], string=re.compile(re.escape(chart_name.split(':')[0]), re.I))
if chart_title_elem:
print(f" ✅ Found chart title on page: {chart_title_elem.get_text(strip=True)}")
else:
print(f" ⚠️ Chart title '{chart_name}' not found on page")
track_title = title_link.get_text(strip=True)
track_url = urljoin(self.base_url, title_link['href'])
# Method 1: Try chart-specific table extraction first (most reliable for chart pages)
tracks = self.extract_tracks_from_chart_table(soup, chart_name, limit)
# Extract artist
artist_container = item.find(class_=re.compile(r'ArtistNames|artist'))
if artist_container:
artist_links = artist_container.find_all('a', href=re.compile(r'/artist/'))
artists = [link.get_text(strip=True) for link in artist_links]
artist_text = ', '.join(artists) if artists else 'Unknown Artist'
else:
artist_text = 'Unknown Artist'
track_data = {
'position': len(tracks) + 1,
'artist': artist_text,
'title': track_title,
'list_name': f"New Chart: {chart_name}",
'url': track_url,
'chart_source': chart_name
}
if len(tracks) >= 10:
print(f" ✅ Chart table extraction found {len(tracks)} tracks")
return tracks
tracks.append(track_data)
# Method 2: Fallback to general page extraction
print(f" ⚠️ Chart table extraction found {len(tracks)} tracks, trying general extraction...")
general_tracks = self.extract_tracks_from_page(soup, f"New Chart: {chart_name}", limit)
except Exception:
continue
if len(general_tracks) > len(tracks):
tracks = general_tracks
print(f" ✅ General extraction found {len(tracks)} tracks")
# Method 3: Last resort - generic table extraction
if len(tracks) < 10:
print(f" ⚠️ Still low track count, trying generic table extraction...")
table_tracks = self.extract_tracks_from_table_format(soup, chart_name, limit)
if len(table_tracks) > len(tracks):
tracks = table_tracks
print(f" ✅ Generic table extraction found {len(tracks)} tracks")
print(f" 📊 Final result: {len(tracks)} tracks extracted from {chart_name}")
return tracks
except Exception as e:
print(f" ❌ Error extracting tracks from chart {chart_name}: {e}")
return []
def extract_tracks_from_chart_table(self, soup, chart_name: str, limit: int) -> List[Dict]:
"""Extract tracks from Beatport chart table structure (tracks-table class)"""
tracks = []
print(f" 🔍 DEBUG: Looking for tracks-table container...")
# Look for the tracks table container
tracks_table = soup.find(class_=re.compile(r'tracks-table'))
if not tracks_table:
print(f" ⚠️ No tracks-table container found")
# Debug: Let's see what table classes ARE available
all_tables = soup.find_all(['table', 'div'], class_=re.compile(r'table|Table', re.I))
print(f" 🔍 DEBUG: Found {len(all_tables)} table-like elements")
for i, table in enumerate(all_tables[:5]):
classes = table.get('class', [])
print(f" Table {i+1}: {' '.join(classes)}")
return tracks
print(f" ✅ Found tracks-table container with classes: {tracks_table.get('class', [])}")
# Find all track rows using data-testid or table row classes
track_rows_testid = tracks_table.find_all(['div', 'tr'], attrs={'data-testid': 'tracks-table-row'})
track_rows_class = tracks_table.find_all(class_=re.compile(r'Table.*Row.*tracks-table'))
track_rows_generic = tracks_table.find_all(class_=re.compile(r'Table.*Row'))
print(f" 🔍 DEBUG: Track rows found:")
print(f" - By data-testid='tracks-table-row': {len(track_rows_testid)}")
print(f" - By class pattern 'Table.*Row.*tracks-table': {len(track_rows_class)}")
print(f" - By generic 'Table.*Row': {len(track_rows_generic)}")
# Use the best available option
track_rows = track_rows_testid or track_rows_class or track_rows_generic
if not track_rows:
print(f" ❌ No track rows found in any format")
return tracks
print(f" 🔍 Using {len(track_rows)} track rows for extraction")
for i, row in enumerate(track_rows[:limit]):
try:
# Skip header rows
if row.get('role') == 'columnheader':
continue
# Find track title link - look for the specific structure
title_cell = row.find(class_=re.compile(r'cell.*title|title.*cell'))
if not title_cell:
# Fallback: look for any cell with track links
title_cell = row
track_link = title_cell.find('a', href=re.compile(r'/track/'))
if not track_link:
continue
# Extract track title from the ReleaseName span or link text
title_span = track_link.find(class_=re.compile(r'ReleaseName'))
if title_span:
track_title = title_span.get_text(separator=' ', strip=True)
else:
track_title = track_link.get_text(separator=' ', strip=True)
track_url = urljoin(self.base_url, track_link['href'])
# Extract artists from ArtistNames container
artists = []
artist_container = row.find(class_=re.compile(r'ArtistNames'))
if artist_container:
artist_links = artist_container.find_all('a', href=re.compile(r'/artist/'))
artists = [link.get_text(strip=True) for link in artist_links]
artist_text = ', '.join(artists) if artists else 'Unknown Artist'
# DEBUG: Print track details for first few
if len(tracks) < 3:
print(f" 🔍 DEBUG Track {len(tracks)+1}:")
print(f" Title: '{track_title}'")
print(f" Artist: '{artist_text}'")
print(f" URL: {track_url}")
print(f" Track link href: {track_link.get('href', 'NO HREF')}")
# Extract track number if available
track_no_elem = row.find(class_=re.compile(r'TrackNo'))
position = track_no_elem.get_text(strip=True) if track_no_elem else str(len(tracks) + 1)
track_data = {
'position': position,
'artist': artist_text,
'title': track_title,
'list_name': f"Chart: {chart_name}",
'url': track_url,
'chart_source': chart_name
}
tracks.append(track_data)
# Debug output for first few tracks
if len(tracks) <= 5:
print(f" 🎵 Track {len(tracks)}: {artist_text} - {track_title}")
except Exception as e:
print(f" ⚠️ Error parsing track row {i+1}: {e}")
continue
print(f" ✅ Chart table extraction completed: {len(tracks)} tracks found")
return tracks
def extract_tracks_from_table_format(self, soup, chart_name: str, limit: int) -> List[Dict]:
"""Extract tracks from table format (for charts that use table layout)"""
tracks = []
# Look for table rows containing track data
table_rows = soup.find_all('tr') + soup.find_all('div', class_=re.compile(r'Table.*Row|track.*row', re.I))
print(f" 🔍 Found {len(table_rows)} potential table rows")
for i, row in enumerate(table_rows[:limit]):
try:
# Skip header rows
if row.name == 'tr' and row.find('th'):
continue
# Look for track links
track_links = row.find_all('a', href=re.compile(r'/track/'))
if not track_links:
continue
track_link = track_links[0]
track_title = track_link.get_text(separator=' ', strip=True)
track_url = urljoin(self.base_url, track_link['href'])
# Look for artist information
artist_text = 'Unknown Artist'
# Try multiple methods to find artist
artist_links = row.find_all('a', href=re.compile(r'/artist/'))
if artist_links:
artists = [link.get_text(strip=True) for link in artist_links]
artist_text = ', '.join(artists)
track_data = {
'position': len(tracks) + 1,
'artist': artist_text,
'title': track_title,
'list_name': f"New Chart: {chart_name}",
'url': track_url,
'chart_source': chart_name
}
tracks.append(track_data)
if len(tracks) <= 3: # Debug first few
print(f" 🎵 Track {len(tracks)}: {artist_text} - {track_title}")
except Exception as e:
continue
return tracks
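The three methods above form a fallback chain gated on track count, and the preferred tracks-table path keys on data-testid attributes, which bs4 matches through the attrs argument. A minimal standalone illustration (the markup is invented, not Beatport's actual DOM):

import re
from bs4 import BeautifulSoup

html = '''
<div class="tracks-table">
  <div data-testid="tracks-table-row">
    <a href="/track/7/acid-line"><span class="ReleaseName">Acid Line</span></a>
    <div class="ArtistNames"><a href="/artist/3/someone">Someone</a></div>
  </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
table = soup.find(class_=re.compile(r'tracks-table'))
for row in table.find_all(attrs={'data-testid': 'tracks-table-row'}):
    title = row.find(class_=re.compile(r'ReleaseName')).get_text(separator=' ', strip=True)
    artist = row.find(class_='ArtistNames').a.get_text(strip=True)
    print(artist, '-', title)  # -> Someone - Acid Line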

@@ -11942,6 +11942,51 @@ def get_beatport_genre_tracks(genre_slug, genre_id):
"count": 0
}), 500
@app.route('/api/beatport/chart/extract', methods=['POST'])
def extract_beatport_chart_tracks():
"""Extract tracks from a specific Beatport chart URL"""
try:
data = request.get_json()
chart_url = data.get('chart_url')
chart_name = data.get('chart_name', 'Unknown Chart')
limit = int(data.get('limit', 100))
if not chart_url:
return jsonify({
"success": False,
"error": "chart_url is required",
"tracks": [],
"count": 0
}), 400
logger.info(f"🔍 API request to extract tracks from chart: {chart_name}")
logger.info(f"🔗 Chart URL: {chart_url}")
# Initialize the Beatport scraper
scraper = BeatportUnifiedScraper()
# Extract tracks from the specific chart URL
tracks = scraper.extract_tracks_from_chart(chart_url, chart_name, limit)
logger.info(f"✅ Successfully extracted {len(tracks)} tracks from chart: {chart_name}")
return jsonify({
"success": True,
"tracks": tracks,
"chart_name": chart_name,
"chart_url": chart_url,
"count": len(tracks)
})
except Exception as e:
logger.error(f"❌ Error extracting tracks from chart: {e}")
return jsonify({
"success": False,
"error": str(e),
"tracks": [],
"count": 0
}), 500
@app.route('/api/beatport/genre/<genre_slug>/<genre_id>/top-10', methods=['GET'])
def get_beatport_genre_top_10(genre_slug, genre_id):
"""Get top 10 tracks for a specific Beatport genre"""

@@ -11612,6 +11612,7 @@ function setupNewChartItemHandlers(genreSlug, genreId, genreName) {
const chartUrl = item.dataset.chartUrl;
console.log(`🎵 Chart clicked: ${chartName} by ${chartArtist}`);
console.log(`🔗 Chart URL: ${chartUrl}`);
try {
// Create a virtual chart data object
@@ -11620,8 +11621,18 @@ function setupNewChartItemHandlers(genreSlug, genreId, genreName) {
showToast(`Loading ${chartName}...`, 'info');
// For demonstration, we'll use the genre tracks as chart content
const response = await fetch(`/api/beatport/genre/${genreSlug}/${genreId}/tracks?limit=20`);
// Use the new chart extraction endpoint with the actual chart URL
const response = await fetch('/api/beatport/chart/extract', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
chart_url: chartUrl,
chart_name: chartName,
limit: 100
})
});
if (!response.ok) {
throw new Error(`Failed to fetch chart content: ${response.status}`);
}
@@ -11814,9 +11825,8 @@ function setupGenreChartItemHandlers(genreSlug, genreId, genreName) {
const chartUrl = item.dataset.chartUrl;
console.log(`🎵 Chart clicked: ${chartName} by ${chartArtist}`);
console.log(`🔗 Chart URL: ${chartUrl}`);
// For now, we'll create a virtual playlist from this chart
// This would eventually fetch the actual chart contents from Beatport
try {
// Create a virtual chart data object
const chartHash = `individual_chart_${genreSlug}_${Date.now()}`;
@@ -11824,9 +11834,18 @@ function setupGenreChartItemHandlers(genreSlug, genreId, genreName) {
showToast(`Loading ${chartName}...`, 'info');
// For demonstration, we'll use the genre tracks as chart content
// In a real implementation, this would fetch the specific chart tracks
const response = await fetch(`/api/beatport/genre/${genreSlug}/${genreId}/tracks?limit=20`);
// Use the new chart extraction endpoint with the actual chart URL
const response = await fetch('/api/beatport/chart/extract', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
chart_url: chartUrl,
chart_name: chartName,
limit: 100
})
});
if (!response.ok) {
throw new Error(`Failed to fetch chart content: ${response.status}`);
}
