description_parser.py
Module for trying to parse and retrieve song data from descriptions
artist_from_title(title)
¶
Try to determine an artist by searching YouTube for the video title, then repeatedly (n times) taking a random subset of the result titles and looking for the longest substring common to that subset.
Source code in mps_youtube/description_parser.py
def artist_from_title(title):
    """ Try to determine an artist by doing a search on the video
    and looking n number of times for the longest common substring in a
    random subset of the result titles from youtube.

    Among the collected candidates (longer than three characters) the one
    that appears earliest in ``title`` is chosen; candidates that do not
    appear in ``title`` at all are ranked last.

    Returns the capitalized guess with leading and trailing non-word
    characters removed, or None when there are no search results or no
    candidate could be found.
    """
    # Parameters for the (currently disabled) search request below.
    query = {}
    query['q'] = title
    query['type'] = 'video'
    query['fields'] = "items(snippet(title))"
    query['maxResults'] = 50
    query['part'] = "snippet"
    results = None  # pafy.call_gdata('search', query)['items']

    # Without results there is nothing to guess from; bail out instead of
    # failing while iterating over None.
    if not results:
        return None

    titles = [x['snippet']['title'].upper() for x in results]

    alts = {}
    for _ in range(100):
        random.shuffle(titles)
        subset = titles[:10]
        string = long_substr(subset).strip()
        if len(string) > 3:
            alts[string] = alts.get(string, 0) + 1

    # No usable common substring was found in any of the rounds.
    if not alts:
        return None

    upper_title = title.upper()

    def position(key):
        # str.find returns -1 when the key is missing; rank such keys
        # after every real position instead of before them.
        found = upper_title.find(key)
        return found if found >= 0 else len(upper_title)

    # min() keeps the first candidate on ties, like a strict "<" scan.
    best_string = min(alts, key=position).capitalize()

    best_string = re.sub(r"([^\w]+)$", "", best_string)
    best_string = re.sub(r"^([^\w]+)", "", best_string)
    return best_string
calculate_certainty(line)
¶
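Determine how likely a line is to be a tracklist entry, based on whether it starts with a timestamp, an "artist - title" style pair, or a track number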
Source code in mps_youtube/description_parser.py
def calculate_certainty(line):
    """ Score how strongly a line looks like a tracklist entry.

    Each pattern is tried with ``re.match`` (i.e. at the start of the
    line) and contributes its weight when it matches:

    * a timestamp-like prefix built around ``:`` (weight 1)
    * text, one or more dashes, then more text (weight 0.75)
    * a leading number followed by dots or spaces (weight 1)

    Returns the sum of the matched weights divided by the number of
    patterns.
    """
    weighted_patterns = (
        (r"(?:\(?(?:\d{0,4}:)?\d{0,2}:\d{0,2}\)?(?: - )?){1,2}", 1),
        (r"(([\w&()\[\]'\.\/ ]+)([ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+", 0.75),
        (r"^([\d]+[. ]+)", 1),
    )
    score = sum(
        (weight for pattern, weight in weighted_patterns
         if re.match(pattern, line)),
        0.0,
    )
    return score / len(weighted_patterns)
has_artist(text)
¶
Determine if the string has an artist or not
Source code in mps_youtube/description_parser.py
def has_artist(text):
    """ Determine if the string has an artist or not.

    NOTE: despite the name, this returns True when the text does NOT
    start with a "text - text" pair and False when it does.
    """
    artist_track_pattern = (
        r"(?:([\w&()\[\]'\.\/ ]+)(?:[ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+"
    )
    found = re.match(artist_track_pattern, text)
    return found is None
is_substr(find, data)
¶
Check if `find` is a substring of every element in `data`
Source code in mps_youtube/description_parser.py
def is_substr(find, data):
    """ Check whether ``find`` is contained in every element of ``data``.

    Returns False when both ``find`` and ``data`` are empty, otherwise
    True only if no element of ``data`` lacks ``find``.
    """
    if not (len(data) or len(find)):
        return False
    return all(find in candidate for candidate in data)
long_substr(data)
¶
https://stackoverflow.com/a/2894073
Source code in mps_youtube/description_parser.py
def long_substr(data):
    """ Return the longest substring shared by all strings in ``data``.

    Every slice of the first string is tried as a candidate; the first
    longest slice found in all elements wins. Returns '' when ``data``
    holds fewer than two elements or its first element is empty.

    Based on https://stackoverflow.com/a/2894073
    """
    if len(data) <= 1 or not data[0]:
        return ''

    reference = data[0]
    longest = ''
    for start in range(len(reference)):
        for size in range(len(reference) - start + 1):
            # Only candidates strictly longer than the current best matter.
            if size <= len(longest):
                continue
            candidate = reference[start:start + size]
            if is_substr(candidate, data):
                longest = candidate
    return longest
parse(text, title='Unknown')
¶
Main function: parse a description text into a list of (artist, track) tuples
Source code in mps_youtube/description_parser.py
def parse(text, title="Unknown"):
    """ Main function: extract (artist, track) tuples from a description.

    Every line of ``text`` gets a certainty score; only lines scoring
    above the average are kept and split into artist and track. If most
    kept lines carry an artist, the ones without are dropped. Otherwise
    the lines without an artist are kept and all receive the artist
    guessed from ``title``.

    Returns a list of (artist, track) tuples, which is empty when no line
    stands out.
    """
    # Determine a certainty index for each line
    lines = [(calculate_certainty(line), line) for line in text.split('\n')]

    # Get average from all strings
    certainty_average = sum(x[0] for x in lines) / len(lines)

    # Single out lines with above average certainty index
    lines = [x for x in lines if x[0] > certainty_average]
    if not lines:
        # Nothing looks like a tracklist entry, so there is no reason to
        # go on and look up an artist for the title.
        return []

    # Determine if they are artist combo strings or only title.
    # The has_artist() result is handed to strip_string() as its
    # ``single`` flag.
    cmbs = []
    for _, line in lines:
        is_ac = has_artist(line)
        cmbs.append(strip_string(line, is_ac))

    # No or very few tracklists will omit artists or add artist information
    # on only a few select number of tracks, therefore we count entries with
    # and without artist, and remove the anomalies IF the number of
    # anomalies is small enough
    has_count = sum(1 for combo in cmbs if combo[0])
    not_count = len(cmbs) - has_count

    diff = abs(has_count - not_count)
    # NOTE(review): diff can never exceed has_count + not_count, so this
    # branch looks unreachable - confirm the intended threshold.
    if diff > has_count + not_count:
        print("Too many anomalities detected")
        return []

    if has_count > not_count:
        return [x for x in cmbs if x[0] is not None]

    arti = artist_from_title(title)
    return [(arti, x[1]) for x in cmbs if x[0] is None]
strip_string(text, single=False)
¶
Strip an artist-combo string
Source code in mps_youtube/description_parser.py
def strip_string(text, single=False):
    """ Strip an artist-combo string.

    Removes timestamps, a leading track number and one leading character
    per line that cannot start a name, then splits the rest into artist
    and track.

    With ``single`` set the cleaned text is returned as the track and the
    artist is None. The same fallback is used when ``single`` is not set
    but the cleaned text contains no "artist - track" pair.

    Returns an (artist, track) tuple.
    """
    # Removes timestamps
    ts_reg = r"(?:\(?(?:\d{0,4}:)?\d{1,2}:\d{1,2}\)?(?: - )?){1,2}"
    text = re.sub(ts_reg, "", text)

    # Removes Tracknumbers.
    text = re.sub(r"^([\d]+[. ]+)", "", text)

    # Removes starting with non words
    text = re.sub(r"^[^\w&()\[\]'\.\/]", "", text, flags=re.MULTILINE)

    if single:
        return None, text

    rgex = r"(?:([\w&()\[\]'\.\/ ]+)(?:[ ]?[-]+[ ]?)([\w&()\[\]'\.\/ ]+))+"
    matches = re.findall(rgex, text)
    if not matches:
        # The cleanup above can remove the separator (e.g. "1 - 2"), so
        # treat the remainder as a title instead of raising IndexError.
        return None, text

    artist, track = matches[0]
    return artist, track