[tracker] data-generators: Create more useful information for music generation



commit 3400c6a781486d69d54939101070e4877cc3c500
Author: Martyn Russell <martyn lanedo com>
Date:   Wed Dec 2 18:09:59 2009 +0200

    data-generators: Create more useful information for music generation

 utils/data-generators/generate-data-for-music.py |  203 +++++++++++++---------
 utils/data-generators/generate-name.py           |  200 +++++++++++++++++++++
 2 files changed, 318 insertions(+), 85 deletions(-)
---
diff --git a/utils/data-generators/generate-data-for-music.py b/utils/data-generators/generate-data-for-music.py
index f935e68..25acb30 100755
--- a/utils/data-generators/generate-data-for-music.py
+++ b/utils/data-generators/generate-data-for-music.py
@@ -1,10 +1,10 @@
 #! /usr/bin/env python
 
+import os
 import sys
 import random
 import urllib
 
-
 artist_UID = {}
 album_UID = {}
 fileid = 0
@@ -20,96 +20,116 @@ def printHeader():
 	f.write("@prefix nie:   <http://www.semanticdesktop.org/ontologies/2007/01/19/nie#>.\n");
 	f.write("@prefix xsd:   <http://www.w3.org/2001/XMLSchema#>.\n\n")
 
-def updatetag(artistid, albumid, trackid, genreid):
+def generate_name():
+        name = os.popen('./generate-name.py').read()
+
+        first_name = ""
+        last_name = ""
+
+ 	for line in name.splitlines():
+                if not first_name:
+                        first_name = line
+                        continue
+
+                if not last_name:
+                        last_name = line
+                        continue
+
+        full_name = '%s %s' % (first_name, last_name)
 
+        return full_name
+
+def update_tag(artistid, artistname, albumid, trackid, genreid):
 	global fileid
 	
-	length = 0
-	length=random.randint(5000,5000000 )
-	song = 'SongTitle [%03u]' % fileid 
-	artist = 'TrkArtist [%03u]' % artistid
-	album = 'TrkAlbum [%03u]' % albumid
-	genre = 'Genre-[%03u]' %genreid
+	length = random.randint(5000,5000000)
+	song = 'SongTitle%03u' % fileid 
+	album = 'Album%03u' % albumid
+	genre = 'Genre%03u' % genreid
 	trackstr = str(artistid) + '/' + str(trackid)
-	fullpath = '/home/abc/d/e/%03u.mp3' %fileid
-	fileid +=1
-	year = '2009'
-
-	size ='%03u' %fileid
-	modified = "2009-07-17T15:18:16"
-	created = "2009-07-17T15:18:16"
+	fullpath = '/home/foo/music/%s/%s/%03u.mp3' % (artistname, album, trackid)
+	fileid += 1
+	year = '%04u' % random.randint(1950, 2010)
+	size = '%03u' % random.randint(3 * 1024, 10 * 1024)
+	modified = "%04u-%02u-%02uT15:18:16" % (random.randint(1950, 2010),
+                                                random.randint(1, 12),
+                                                random.randint(1, 28))
+	created = modified
 	
-	if not artist_UID.has_key(artist):
+	if not artist_UID.has_key(artistname):
                 #print " The new  artist is "+artist
                 UID = str(random.randint(0, sys.maxint))
-                artist_UID[artist] = UID
-                f.write('<urn:uuid:'+UID+'> a nco:Contact; \n')
-		f.write('\tnco:fullname "'+artist+'".\n\n')
-
-
-	else :
-                UID = artist_UID[artist]
+                artist_UID[artistname] = UID
+                f.write('<urn:uuid:' + UID + '> a nco:Contact; \n')
+		f.write('\tnco:fullname "' + artistname + '".\n\n')
+	else:
+                UID = artist_UID[artistname]
 
 	if not album_UID.has_key(album):
                 album_UID[album] = album
-                f.write('<urn:album:'+album+'> a nmm:MusicAlbum; \n')
+                f.write('<urn:album:' + album + '> a nmm:MusicAlbum; \n')
 		
-                if len(UID)>0: f.write('\tnmm:albumArtist <urn:uuid:'+UID+'>;\n')
-                f.write('\tnie:title "'+album+'".\n\n')
+                if len(UID)>0: 
+                        f.write('\tnmm:albumArtist <urn:uuid:' + UID + '>;\n')
 
-	else :
-                UID = artist_UID[artist]
+                f.write('\tnie:title "' + album + '".\n\n')
+	else:
+                UID = artist_UID[artistname]
 
+        f.write('<file://' + urllib.pathname2url(fullpath) + '> a nmm:MusicPiece,nfo:FileDataObject;\n')
+        if len(song) > 0: 
+                f.write('\tnie:title "' + song + '";\n')
 
-        f.write('<file://'+urllib.pathname2url(fullpath)+'> a nmm:MusicPiece,nfo:FileDataObject;\n')
-        if len(song)>0:f.write('\tnie:title "'+song+'";\n')
-        f.write('\tnfo:fileName \"%03u.mp3\";\n' %fileid)
-        f.write('\tnfo:fileLastModified "'+modified+'" ;\n')
-        f.write('\tnfo:fileCreated "'+created+'";\n')
-        f.write('\tnfo:fileSize '+str(size)+';\n')
-        f.write('\tnmm:musicAlbum <urn:album:'+album+'>;\n')
-        f.write('\tnmm:genre "'+genre+'";\n')
-        if len(trackstr)>0:
-        	trackArray=trackstr.split("/")
-                if len(trackArray)>0: f.write('\tnmm:trackNumber '+trackArray[0]+';\n')
+        f.write('\tnfo:fileName \"' + artistname + '.mp3\";\n')
+        f.write('\tnfo:fileLastModified "' + modified + '" ;\n')
+        f.write('\tnfo:fileCreated "' + created + '";\n')
+        f.write('\tnfo:fileSize ' + str(size) + ';\n')
+        f.write('\tnmm:musicAlbum <urn:album:' + album + '>;\n')
+        f.write('\tnmm:genre "' + genre + '";\n')
 
+        if len(trackstr) > 0:
+        	trackArray = trackstr.split("/")
+                if len(trackArray) > 0: 
+                        f.write('\tnmm:trackNumber ' + trackArray[0] + ';\n')
 
-	f.write('\tnmm:length '+str(length)+';\n')
-        f.write('\tnmm:performer <urn:uuid:'+UID+'>.\n\n')
+	f.write('\tnmm:length ' + str(length) + ';\n')
+        f.write('\tnmm:performer <urn:uuid:' + UID + '>.\n\n')
 
+def create_track(artistid, albumid, genreid, settings):
+        artistname = generate_name()
 
+        for trackid in range(1, settings['TitlesPerAlbum'] + 1):
+                update_tag(artistid, artistname, albumid, trackid, genreid)
 
-def create_track( artistid, albumid, genreid, settings):
-    for trackid in range(1, settings['TitlesPerAlbum'] + 1):
-        updatetag(artistid, albumid, trackid, genreid)
         genreid += 1
         if genreid > settings['GenreCount']:
-            genreid = 1
-    return   genreid
+                genreid = 1
 
+        return genreid
 
 def generate(settings):
-    ''' A total of TotalTracks files will be generated.
+        ''' A total of TotalTracks files will be generated.
 	These contain the specified number of albums.'''
-    '''	
-    filepath = settings['OutputDir']
-    try:
+        '''	
+        filepath = settings['OutputDir']
+        try:
     	os.makedirs(filepath)
-    except:
+        except:
         print 'Directory exists'
-    '''
-
-    global album_UID 
-    genreid = 1
-    artistid = 1
-    albumid = 0
-    for artistid in range(1, settings['ArtistCount'] + 1):
-	album_UID = {}
+        '''
+
+        global album_UID 
+        genreid = 1
+        artistid = 1
+        albumid = 0
+
+        for artistid in range(1, settings['ArtistCount'] + 1):
+                album_UID = {}
+                # The album loop must stay nested here so every artist gets AlbumCount albums.
-    	for albums in range(1, settings['AlbumCount'] + 1):
-		albumid+=1
-        	genreid = create_track(artistid, albumid, genreid, settings)
+                for albums in range(1, settings['AlbumCount'] + 1):
+                        albumid += 1
+                        genreid = create_track(artistid, albumid, genreid, settings)
 
-
 if __name__ == '__main__':
 	settings = {}
 
@@ -117,32 +137,48 @@ if __name__ == '__main__':
 
 	parser = OptionParser()
 
-        parser.add_option("-T", "--TotalTracks", dest='TotalTracks',
-                        help="Specify (mandatory) the total number of files to be generated" , metavar="TotalTracks")
-        parser.add_option("-r", "--ArtistCount", dest='ArtistCount', default=2,
-                        help="Specify (mandatory) the total number of Artists." , metavar="ArtistCount")
-        parser.add_option("-a", "--album-count", dest='AlbumCount', default=5,
-                        help="Specify (mandatory) the number of albums per artist." , metavar="AlbumCount")
-        parser.add_option("-g", "--genre-count", dest='GenreCount', default=10,
-                        help="Specify the genre count" , metavar="GenreCount")
-        parser.add_option("-o", "--output", dest='OutputFileName', default='songlistDirect.ttl',
-                        help="Specify the output ttl filename. \
-			      E.g., -T 2000 -r 25 -a 20 -g 10 -o generated_songs.ttl" , metavar="OutputFileName")
+        parser.add_option("-T", "--TotalTracks", 
+                          dest='TotalTracks',
+                          help="Specify (mandatory) the total number of files to be generated", 
+                          metavar="TotalTracks")
+        parser.add_option("-r", "--ArtistCount", 
+                          dest='ArtistCount', 
+                          default=2,
+                          help="Specify (mandatory) the total number of Artists." , 
+                          metavar="ArtistCount")
+        parser.add_option("-a", "--album-count", 
+                          dest='AlbumCount', 
+                          default=5,
+                          help="Specify (mandatory) the number of albums per artist.", 
+                          metavar="AlbumCount")
+        parser.add_option("-g", "--genre-count", 
+                          dest='GenreCount', 
+                          default=10,
+                          help="Specify the genre count" , 
+                          metavar="GenreCount")
+        parser.add_option("-o", "--output", 
+                          dest='OutputFileName', 
+                          default='songlistDirect.ttl',
+                          help="Specify the output ttl filename. e.g. -T 2000 -r 25 -a 20 -g 10 -o generated_songs.ttl", 
+                          metavar="OutputFileName")
 
 	(options, args) = parser.parse_args()
 	
 	mandatories = ['TotalTracks', 'ArtistCount', 'AlbumCount']  
 	for m in mandatories:  
-	    if not options.__dict__[m]:  
-	         print "\nMandatory options  missing\n"  
-	         parser.print_help()  
-	         sys.exit(-1)  
-
-	settings['TotalTracks'] = int(options.TotalTracks)
-	if settings['TotalTracks'] < (int(options.ArtistCount) * int(options.AlbumCount) ):
+                if not options.__dict__[m]:
+                        # Set defaults
+                        if m == "TotalTracks":
+                                options.TotalTracks = 5000
+                        elif m == "ArtistCount":
+                                options.ArtistCount = 60
+                        elif m == "AlbumCount":
+                                options.AlbumCount = 60
+
+	settings['TotalTracks'] = int(options.TotalTracks)  # optparse returns user-supplied values as strings
+	if settings['TotalTracks'] < (int(options.ArtistCount) * int(options.AlbumCount)):
 		sys.exit('InputError: TotalTracks should be greater than or equal to  ArtistCount * AlbumCount')
 
-
 	settings['TitlesPerAlbum'] = settings['TotalTracks'] / (int(options.ArtistCount) * int(options.AlbumCount))
 	#print 'settings[\'TitlesPerAlbum\'] %d' %settings['TitlesPerAlbum']
 	settings['ArtistCount'] = int(options.ArtistCount)
@@ -150,12 +186,9 @@ if __name__ == '__main__':
 	settings['GenreCount'] = int(options.GenreCount)
 	settings['OutputFileName'] = options.OutputFileName
 
-	print '\n'+str(settings)+'\n'
+        
+	print '\n' + str(settings) + '\n'
 
 	f = open(settings['OutputFileName'], 'w' )
 	printHeader()
 	generate(settings)
-
-
-
-
diff --git a/utils/data-generators/generate-name.py b/utils/data-generators/generate-name.py
new file mode 100755
index 0000000..61450fc
--- /dev/null
+++ b/utils/data-generators/generate-name.py
@@ -0,0 +1,200 @@
+#! /usr/bin/env python
+
+# Context-free grammar random name generator
+# Jeremy Thurgood <firxen gmail com>
+# Highly experimental at present, but sort of working
+
+import random
+import re
+import sys
+
+
+class GrammarError(RuntimeError):
+    pass
+
+class CFNameGen(object):
+    # This should be done using gettext for i18n, but I can't be bothered to figure
+    # out how to do it properly, so I'm using replacement strings for now.
+    stringUndefinedNonTerminal = "Undefined non-terminal \"%(undefinedNonTerminal)s\" in rule \"%(rule)s\"."
+
+    # Regular expression to catch non-terminals, used frequently, so global
+    reNonTerminal = re.compile(r"<(\w+)>")
+
+    def __init__(self, nameGrammar):
+        """Create a namegen object.
+
+        We take a grammar dict, as before the Great Refactoring.
+        """
+
+        self.checkTypes(nameGrammar)
+        self.grammar = nameGrammar
+
+    # checkTypes() is only useful while testing with internally specified grammars.
+    # Once we're parsing an external file it becomes unnecessary since we generate
+    # the data types ourselves instead of asking a human to do it.  As such, error
+    # strings are hardcoded.  Anyone who sees them would be messing around in here
+    # anyway.
+    def checkTypes(self, nameGrammar):
+        """Check given grammar object for correct datatypes.
+        
+        This function is only really necessary while the grammar's still being
+        specified in here.  It will likely disappear when we parse the grammar from a
+        data file.
+        """
+        if not isinstance(nameGrammar, dict):
+            raise GrammarError("Grammar data is not a dictionary!")
+        for rule, rhs in nameGrammar.items():
+            if not isinstance(rhs, list):
+                raise GrammarError("Rule \"%s\" is not a list!" % rule)
+            for option in rhs:
+                if not isinstance(option, str):
+                    raise GrammarError("Rule \"%s\" does not contain only strings!" % rule)
+                
+    # Grammar verification stuff follows.  We can probably make this throw warnings
+    # and correct problems, but that's a job for another day.  Incorrect grammars
+    # probably won't provide useful output anyway.  If this stuff gets big enough
+    # it may be pushed into its own module.
+
+    def checkUndefinedNonTerminals(self, nameGrammar):
+        """Check given grammar for undefined non-terminals.
+        
+        An undefined non-terminal is a non-terminal symbol used in a symbol
+        definition that has no definition of its own and cannot therefore be
+        expanded.  Undefined non-terminals can lead to ugly error messages
+        instead of beautifully generated names.
+        """
+        for rule, rhs in nameGrammar.items():
+            for option in rhs:
+                tempStr = option
+                matchNonTerminal = self.reNonTerminal.search(tempStr)
+                while matchNonTerminal:
+                    if matchNonTerminal.group(1) not in nameGrammar:
+                        return {"undefinedNonTerminal": matchNonTerminal.group(1), "rule": rule}
+                    tempStr = self.reNonTerminal.sub("", tempStr, 1)
+                    matchNonTerminal = self.reNonTerminal.search(tempStr)
+
+    def checkUnproductiveNonTerminals(self, nameGrammar):
+        """Check grammar for possibly unproductive non-terminals.
+        
+        An unproductive non-terminal is a non-terminal symbol that cannot be
+        converted to a terminal symbol in the given grammar.  A good example of this
+        is a non-terminal symbol that includes itself in its definition.
+        
+        This function is currently very basic and should be extended (rewritten?) to
+        allow warnings for _possible_ unproductive non-terminals and errors for
+        _definite_ unproductive non-terminals.  Volunteers?
+        
+        XXX: INCOMPLETE
+        """
+        def recurse(a):
+            if a == 5:
+                return a
+            return recurse(a+1)
+
+        grammarUnchecked = dict([(rule, "".join(rhs)) for (rule, rhs) in nameGrammar.items()])
+        grammarProductive = []
+        finished = False
+        while not finished:
+            print "grammarProductive:"
+            print grammarProductive
+            print "grammarUnchecked:"
+            print grammarUnchecked
+            print
+            finished = True
+            for rule, rhs in grammarUnchecked.items():
+                matchNonTerminal = self.reNonTerminal.search(rhs)  # class attribute: a bare name raises NameError
+                while matchNonTerminal:
+                    matchString = matchNonTerminal.group(1)
+                    if matchString not in grammarProductive:
+                        break
+                    rhs = rhs.replace("<"+matchString+">", "")
+                    finished = False
+                    matchNonTerminal = self.reNonTerminal.search(rhs)  # look for the next remaining non-terminal
+                if not matchNonTerminal:
+                    grammarProductive.append(rule)
+                    del grammarUnchecked[rule]
+                    finished = False
+                    continue
+                grammarUnchecked[rule] = rhs
+
+    # More grammar checking functions to come:
+    #   Unused non-terminals
+    # Loop detection would be nice, but currently a little impractical.
+    
+    def checkUnusedNonTerminals(self, nameGrammar):
+        """Check grammar for non-terminals that can never be reached.
+        
+        While unused non-terminals are irrelevant in the generation of sentences,
+        their presence usually implies an error in the grammar.
+        
+        XXX: INCOMPLETE
+        """
+        
+        pass
+
+    # verifyGrammar() uses the above functions to verify the correctness of a
+    # grammar.  This isn't perfect, but it should catch the most common problems.
+    def verifyGrammar(self):
+        error = self.checkUndefinedNonTerminals(self.grammar)
+        if error:
+            return self.stringUndefinedNonTerminal % error  # class attribute: a bare name raises NameError
+        if "name" not in self.grammar:
+            return "Rule \"name\" not present!"  # getName() starts from the "name" rule
+
+    # Now to the meat of the problem, which is actually almost trivial thanks to
+    # the dictionary data type.  I love python ;-)
+
+    def getName(self):
+        nameStr = random.choice(self.grammar["name"])  # start from a random "name" production
+        matchNonTerminal = self.reNonTerminal.search(nameStr)
+        while matchNonTerminal:
+            subStr = random.choice(self.grammar[matchNonTerminal.group(1)])
+            nameStr = self.reNonTerminal.sub(lambda _match: subStr, nameStr, 1)  # function repl inserts subStr literally
+            matchNonTerminal = self.reNonTerminal.search(nameStr)
+        return nameStr
+
+
+if __name__ == "__main__":
+    # Main body
+    # Test grammar -- will be read from a file when I decide how to do it properly
+    # with minimum effort (for the user and the code)
+    orkGrammar = {
+        "name": ["<nameStart><nameMiddle0to3><nameEnd>"],
+        "nameMiddle0to3": ["","<nameMiddle>", "<nameMiddle><nameMiddle>", "<nameMiddle><nameMiddle><nameMiddle>"],
+        "nameStart": ["<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsVowel>"],
+        "nameMiddle": ["<nmCons><nmVowel>"],
+        "nameEnd": ["<neCons><neVowel>", "<neCons>", "<neCons>"],
+        "nsCons": ["D", "G", "K", "T", "Gr"],
+        "nmCons": ["d", "g", "k", "t", "r", "s", "z", "kt", "rs", "gr"],
+        "neCons": ["r", "s", "z"],
+        "nsVowel": ["E", "U"],
+        "nmVowel": ["a", "e", "i", "o", "u"],
+        "neVowel": ["a", "u"]
+        }
+    
+    fooGrammar = {
+        "name": ["<nameStart><nameMiddle0to2><nameEnd>"],
+        "nameMiddle0to2": ["","<nameMiddle>", "<nameMiddle><nameMiddle>"],
+        "nameStart": ["<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsVowel>"],
+        "nameMiddle": ["<nmCons><nmVowel>"],
+        "nameEnd": ["<neCons><neVowel>", "<neCons>", "<neCons>"],
+        "nsCons": ["J", "M", "P", "N", "Y", "D", "F"],
+        "nmCons": ["l", "m", "lm", "th", "r", "s", "ss", "p", "f", "mb", "b", "lb", "d", "lf"],
+        "neCons": ["r", "n", "m", "s", "y", "l", "th", "b", "lb", "f", "lf"],
+        "nsVowel": ["A", "Au", "Ei"],
+        "nmVowel": ["a", "e", "i", "o", "u", "au", "oa", "ei"],
+        "neVowel": ["e", "i", "a", "au"]
+        }
+
+    fooGen = CFNameGen(fooGrammar)
+    errorStr = fooGen.verifyGrammar()
+    if errorStr:
+        sys.exit(errorStr)
+    print fooGen.getName()
+
+    orkGen = CFNameGen(orkGrammar)
+    errorStr = orkGen.verifyGrammar()
+    if errorStr:
+        sys.exit(errorStr)
+    print orkGen.getName()
+



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]