While Wikipedia has a Random page feature, it selects pages uniformly at random from the database. As an alternative, I wrote a script that chooses pages at random, weighted by their hit counts for a month; such a set might give a more representative sample of how Wikipedia looks to visitors. The hit data for, say, September 2004 can be found here (warning: very large file). Below is an example drawn from the hits so far this month (up to 22 September 2004). If you would like a set, just send me a message and tell me a Wikipedia page, and I'll run the script for you and paste in the output. — Matt 15:06, 21 Sep 2004 (UTC)
# Pick articles at random with probability proportional to their hit counts,
# using a hit-statistics listing whose lines match the `r1` pattern below.
#
# The syntax used here is valid on both Python 2.6+ and Python 3.
import re
from bisect import bisect_left
# NOTE: the star import is kept so that any other code relying on it keeps
# working; this script itself only uses random().
from random import *

logFile = "/tmp/url_200409.html"
maxEntries = None  # e.g. 10000 to stop after that many usable entries
numberOfArticles = 100

# One listing line: <hits> <pct>% <number> <pct>% /wiki/<article name>
# Only the first group (hits) and the last group (name) are used.
r1 = re.compile(r'^(\d*)\s*([0-9.]*)%\s*([0-9]*)\s*([0-9.]*)%\s*/wiki/(\S*)$')


class ArticlePicker:
    """Read (hits, article) pairs from a log file and sample them by weight.

    Attributes set by readLogFile():
        hitList -- list of (hits, name) tuples, sorted by descending hits
        hitSum  -- sum of all hits in hitList
        count   -- number of entries in hitList
    """

    def __init__(self, logFile, maxEntries = False):
        """Remember the log file path and the optional entry cap.

        logFile    -- path of the listing to read.
        maxEntries -- if truthy, the maximum number of accepted entries to
                      read; any falsy value (False, None, 0) means no limit.
        """
        self.logFile = logFile
        self.hitList = []
        self.count = 0
        # Initialised here so selectRandomly() is usable (and returns empty
        # slots) even when readLogFile() has not been called yet.
        self.hitSum = 0
        self.maxEntries = maxEntries

    def readLogFile(self):
        """(Re)load hitList, hitSum and count from self.logFile.

        Lines that parseLine() rejects with ValueError are skipped.  Reading
        stops once maxEntries entries have been accepted (when a cap is set).
        Raises whatever open() raises if the file cannot be opened.
        """
        # Start from a clean slate so that calling this twice does not
        # duplicate entries or leave hitSum out of step with hitList.
        self.hitList = []
        self.hitSum = 0
        self.count = 0
        # `with` guarantees the file is closed even if an error escapes.
        with open(self.logFile) as logHandle:
            for rawLine in logHandle:
                if self.maxEntries and len(self.hitList) >= self.maxEntries:
                    break
                try:
                    hits, name = self.parseLine(rawLine)
                except ValueError:
                    continue  # unparseable or deliberately excluded line
                self.hitList.append((hits, name))
                self.hitSum += hits
        self.count = len(self.hitList)
        # Most popular first.
        self.hitList.sort(reverse=True)

    def parseLine(self, line):
        """Return (hits, name) for one listing line.

        Underscores in the article name are replaced by spaces.
        Raises ValueError if the line does not match `r1`, if the hit count
        is not a valid integer, or if filterOut() rejects the name.
        """
        m = r1.match(line.strip())
        if m is None:
            raise ValueError("No matches found")
        hits, _pct1, _num, _pct2, name = m.groups()
        self.filterOut(hits, name)
        return int(hits), name.replace('_', ' ')

    def filterOut(self, hits, name):
        """Raise ValueError for names that must not be sampled.

        hits is accepted for interface compatibility but is not used.
        """
        if name == "":
            raise ValueError  # Exclude blank
        if re.match(r'^\w*:', name):
            raise ValueError  # Exclude namespaces
        if re.match(r'Main_Page', name):
            raise ValueError  # Exclude main page
        # Exclude popular oddities (dots escaped so they match literally;
        # these are still prefix matches, as re.match anchors at the start).
        if re.match(r'_vti_bin/owssvr\.dl|MSOffice/cltreq\.asp', name):
            raise ValueError

    def selectRandomly(self, N = 1):
        """Return a list of N (hits, name) tuples drawn with replacement.

        Each draw picks a point in [0, hitSum) and selects the first article
        whose running hit total reaches that point, so an article's chance is
        proportional to its hits.  If hitList is empty, a list of N None
        values is returned instead.
        """
        if not self.hitList:
            return [None] * N

        # Running totals over hitList, computed once and binary-searched per
        # draw instead of rescanning every article for every draw.
        cumulative = []
        runningTotal = 0
        for hits, _name in self.hitList:
            runningTotal += hits
            cumulative.append(runningTotal)
        lastIndex = len(cumulative) - 1

        outputs = []
        for _ in range(N):
            target = random() * self.hitSum
            # First article whose running total is >= target; clamped in
            # case hitSum exceeds the real total of hitList.
            index = min(bisect_left(cumulative, target), lastIndex)
            outputs.append(self.hitList[index])
        return outputs


def main():
    """Dump a popularity-weighted random selection of articles as wiki text."""
    H = ArticlePicker(logFile, maxEntries)
    H.readLogFile()
    randomArticles = H.selectRandomly(numberOfArticles)
    print("==%d randomly-selected articles (weighted by popularity)==" % numberOfArticles)
    for entry in randomArticles:
        if entry is None:
            continue  # no articles were read, so there is nothing to list
        hits, name = entry
        print("* %s — (%d hits)" % (name, hits))


# Guarded so that importing this module does not read the log file.
if __name__ == "__main__":
    main()