server_playground/doc/www.w3.org/2002/12/cal/slurpIcalSpec.py


								#!/usr/bin/python

								""" slurpIcalSpec.py -- extract formal view of iCalendar/vCard specs


								produces XHTML with various typed links


								@@TODO: make each marked up item visible via css

								(unless they're all linked).


								see also: rfc2html in dev.w3.org @@


								"""


								__version__ = '$Id: slurpIcalSpec.py,v 1.27 2005/11/09 23:10:49 connolly Exp $'

								#see also: changelog at end


								import sys

								import re


								# Raised by main() when argv does not hold exactly one RFC number that is
								# a key of Specs.  The docstring below is also program output: the
								# __main__ block writes e.__doc__ to stderr as the usage message, so its
								# text must be treated as a runtime string.
								class Usage(Exception):

								    """USAGE: python slurpIcalSpec.py NNNN <rfcNNNN.txt >rfcNNNN.html

								    where NNNN is one of the supported RFCS

								    (so far: 2425, 2426, and 2445)

								    """


								# Per-RFC conversion parameters, keyed by RFC number (int).
								# rfcnum: (footerStarters, typos, example_tags)
								#   footerStarters -- line prefixes; depaginate() treats any line that
								#                     starts with one of them as a page break
								#   typos          -- (err, fix) pairs; depaginate() replaces err with
								#                     fix in every input line
								#   example_tags   -- subsection labels for which htmlTop() emits a
								#                     "dd.<class>" CSS rule

								Specs = { 2445: (("RFC", "Dawson"),

								                 (('"EVENT"', '"VEVENT"'), # 4.8.7.3 Last Modified

								                  ('   Purpose This value type',

								                   '   Purpose: This value type'),

								                  ('11.  Full Copyright Statement',

								                   '11  Full Copyright Statement'),

								                  ('3.11 Contact for Further Information:',

								                   '3.11 Contact for Further Information'),

								                  ),

								                 ('Example',),

								                 ),

								          2426: (("RFC", "Dawson"),

								                 # a ref is split across lines; put it on one line

								                 (('Hence, this [MIME-',

								                   'Hence, this [MIME-DIR]'),

								                  ('DIR] profile is',

								                   ' profile is'),

								                  ),

								                 ('Type example',),

								                 ),

								          2425: (("RFC", "Howes"),

								                 (),

								                 (),

								                 )

								          }


								def main(argv):
								    """Convert the RFC text on stdin to XHTML on stdout.

								    argv -- command-line list; argv[1] must parse as an integer that is
								            a key of Specs.

								    Raises Usage when argv does not have exactly two items or argv[1]
								    is not a supported RFC number.
								    """
								    if len(argv) != 2:
								        raise Usage()
								    try:
								        pgbrks, typos, exampleTags = Specs[int(argv[1])]
								    except (ValueError, KeyError):
								        raise Usage()

								    sections = bySection(depaginate(sys.stdin, pgbrks, typos))

								    # The first section is the RFC header block.  The next() builtin
								    # works on Python 2.6+ and 3; the .next() method is Python 2 only.
								    head = next(sections)
								    title, rfcnum, category, date, authlines = titleEtc(head)

								    w = sys.stdout.write

								    # write() plus an explicit newline emits exactly what the former
								    # print statement did, without Python-2-only syntax.
								    w(htmlTop(title, rfcnum, category, date, authlines, exampleTags) + "\n")

								    # The reference labels are needed before any body text is written,
								    # so the remaining sections are materialized and scanned first.
								    sections = list(sections)
								    refs = findRefs(sections)

								    for sec in sections:
								        heading = sec[0]
								        if "Table of Contents" in heading:
								            tocSect(w, sec)
								        elif heading[0].isdigit():
								            numSect(w, sec, refs)
								        else:
								            flowSect(w, sec, refs)

								    w(htmlBot() + "\n")


								def depaginate(fp, footerStarters, typos):
								    """Generate the lines of a paginated RFC with page furniture removed.

								    fp             -- object with a readline() method
								    footerStarters -- prefixes; a line starting with any of them is
								                      handled like a form feed line
								    typos          -- (err, fix) pairs applied to each line with
								                      str.replace before anything else is examined

								    Line endings are stripped.  Blank lines at the top of a page are
								    dropped; other blank lines are held back and only emitted when a
								    non-blank line follows on the same page.  Each page break leaves a
								    single blank line in the output.

								    'Each page must be limited to 58 lines followed by a form feed on a
								     line by itself.'
								      --Instructions to RFC Authors
								        Postel & Reynolds Oct 1997
								        http://www.ietf.org/rfc/rfc2223
								    """
								    pending = []        # blank lines waiting for more text to follow
								    atPageTop = True
								    starters = tuple(footerStarters)

								    while True:
								        raw = fp.readline()
								        if not raw:
								            return

								        text = raw.rstrip("\r\n")
								        for err, fix in typos:
								            text = text.replace(err, fix)

								        if text == "\f" or text.startswith(starters):
								            # leave a blank line where the pagebreak was
								            pending = ['']
								            atPageTop = True
								        elif text:
								            for held in pending:
								                yield held
								            pending = []
								            yield text
								            atPageTop = False
								        elif not atPageTop:
								            pending.append(text)


								def bySection(lines):
								    """Group an iterable of lines into sections; yield each as a list.

								    A non-blank line opens a new section unless it starts with a space,
								    starts with "Request for Comments" or "Category:", or contains ":",
								    "<" or "--".  Every other line is appended to the current section.
								    """
								    def continues(text):
								        # True for lines that can never be a section heading
								        return (not text
								                or text.startswith((" ", "Request for Comments",
								                                    "Category:"))
								                or ":" in text
								                or "<" in text
								                or "--" in text)

								    current = []
								    for text in lines:
								        if current and not continues(text):
								            yield current
								            current = []
								        current.append(text)

								    if current:
								        yield current


								def titleEtc(lines):
								    """Pick the title and header fields out of the RFC header section.

								    lines -- list of header lines; the first blank line separates the
								             two-column header from the title.

								    Returns (title, rfcnum, category, date, authlines):
								      title     -- the lines after the blank line, stripped and joined
								                   with single spaces
								      rfcnum    -- last word in the first 40 columns of lines[1]
								      category  -- text after the first ':' in the first 40 columns of
								                   lines[2]
								      date      -- the line just before the blank line, stripped
								      authlines -- columns 40 onward of every line before the date line

								    Raises ValueError when lines contains no blank line.
								    """
								    sep = None
								    for idx, text in enumerate(lines):
								        if text.strip() == '':
								            sep = idx
								            break
								    if sep is None:
								        # call syntax and repr() are valid on Python 2 and 3, unlike
								        # the "raise E, msg" statement and backtick repr
								        raise ValueError("no blank line separating header from title: "
								                         + repr(lines))

								    title = ' '.join([l.strip() for l in lines[sep+1:]]).strip()
								    rfcnum = lines[1][:40].strip().split()[-1]
								    category = lines[2][:40].strip().split(':')[1].strip()
								    date = lines[sep-1].strip()
								    authlines = [lines[idx][40:].strip() for idx in range(0, sep-1)]

								    return title, rfcnum, category, date, authlines


								def findRefs(sections):
								    """Return the reference labels listed in the References section.

								    sections -- iterable of sections, each a list of lines whose first
								                item is the heading.

								    The first section whose stripped heading ends with "References" is
								    scanned.  The first non-blank line after each blank line (or after
								    the heading) must contain "[label]"; the label is collected and the
								    following non-blank lines are skipped as continuation text.

								    Raises ValueError when no section heading ends with "References",
								    and also (from the unpacking) when an entry's first line lacks '['
								    or ']'.
								    """
								    for sec in sections:
								        if not sec[0].strip().endswith("References"):
								            continue

								        refs = []
								        atEntryStart = True
								        for ln in sec[1:]:
								            ln = ln.strip()
								            if ln == '':
								                atEntryStart = True
								                continue
								            if atEntryStart:
								                junk, ln = ln.split('[', 1)
								                ref, ln = ln.split(']', 1)
								                refs.append(ref)
								                atEntryStart = False
								        return refs

								    # call syntax works on Python 2 and 3; "raise E, msg" is 2-only
								    raise ValueError("no References section found")


								def htmlTop(title, rfcnum, category, date, authlines, exampleTags):
								    """Return the opening XHTML of the page, up to the address block.

								    title       -- document title, used in <title> and <h1>
								    rfcnum      -- RFC number, shown in the header table and used to
								                   build the link to the source rfcNNNN.txt
								    category    -- text shown after "Category:"
								    date        -- text for the second row of the header table
								    authlines   -- list of strings, joined with <br /> in the header
								    exampleTags -- subsection labels; one "dd.<class>" CSS rule is
								                   emitted per label, with the class from asClass()

								    The values are interpolated as given; nothing is escaped here.
								    """
								    css = ""
								    for t in exampleTags:
								        # each rule is closed with "}" so that a following rule (or
								        # the end of the style element) is not swallowed by it
								        css = css + "dd.%s {   border-style: solid;  border-color: #d0dbe7;  border-width: 0 0 0 .25em;  padding-left: 0.5em; }\n" % asClass(t)

								    html = """

								<html xmlns='http://www.w3.org/1999/xhtml'>

								<head profile='http://www.w3.org/2003/g/data-view'>

								  <title>%s</title>

								  <link rel="transformation"

								    href="http://www.w3.org/2002/12/cal/webize2445.xsl"/>

								  <style type="text/css">

								  %s

								  </style>

								</head>

								<body>

								<table>

								<tr><td>Network Working Group<br />

								Request for Comments: %s<br />

								Category: %s<br />

								</td>

								<td>%s</td>

								</tr>

								<tr><td>&#160;</td><td>%s</td></tr>

								</table>

								<h1>%s</h1>

								<address>

								$Revision: 1.27 $ of $Date: 2005/11/09 23:10:49 $

								derived from <a href="http://www.ietf.org/rfc/rfc%s.txt">rfc%s.txt</a>

								and enhanced

								for <a href="http://www.w3.org/2004/01/rdxh/spec">gleaning formal description</a>

								using <a href="slurpIcalSpec.py">slurpIcalSpec.py</a>

								by <a href="http://www.w3.org/People/Connolly/">Dan Connolly</a>

								</address>


								""" % (title, css, rfcnum, category, '<br />'.join(authlines), date, title,
								       rfcnum, rfcnum)

								    return html


								def htmlBot():
								    """Return the tags that close the body and html elements."""
								    closing = "</body></html>"
								    return closing


								def flowSect(w, lines, refs):
								    """Write a section as a div: an h2 heading, then flowed paragraphs.

								    w     -- writer callable taking a string
								    lines -- section lines; lines[0] is the heading
								    refs  -- reference labels, handed on to flowSectRest
								    """
								    heading = lines[0]
								    w("<div><h2>%s</h2>\n" % (heading,))
								    flowSectRest(w, lines, refs)


								def flowSectRest(w, lines, refs):
								    """Write the body of a section as paragraphs and close its div.

								    w     -- writer callable taking a string
								    lines -- section lines; lines[0] (the heading) is skipped
								    refs  -- reference labels, handed on to bodyText

								    Each run of non-blank lines becomes one <p> element; whitespace-only
								    lines end the current paragraph.  "</div>" is written last.
								    """
								    inPara = False
								    for text in lines[1:]:
								        if not text.strip():
								            if inPara:
								                w("</p>\n")
								                inPara = False
								            continue
								        if not inPara:
								            w("<p>\n")
								            inPara = True
								        bodyText(w, text, refs)
								        w("\n")

								    if inPara:
								        w("</p>\n")
								    w("</div>")


								def tocSect(w, lines):
								    """Write the table of contents as a list of links to the sections.

								    w     -- writer callable taking a string
								    lines -- section lines; lines[0] is the heading

								    For each non-blank entry, a leading word that starts with a digit is
								    taken as the section number and used in the "#sec<number>" target;
								    the entry text is cut at its first "." before being written.
								    """
								    w("<div><h2>%s</h2>\n" % (lines[0],))
								    w("<ul type='none'>\n")

								    for raw in lines[1:]:
								        entry = raw.strip()
								        if not entry:
								            continue

								        num = ''
								        if entry[0].isdigit():
								            num, entry = entry.split(None, 1)

								        head = entry.partition(".")[0]
								        w("<li><a href='#sec%s'>%s %s</a></li>\n" % (num, num, head))

								    w("</ul></div>")


								def refSect(w, lines):
								    """Write the bibliography as a definition list and close the div.

								    w     -- writer callable taking a string
								    lines -- section lines; lines[0] (the heading) is skipped

								    Entries are separated by blank lines.  The first line of an entry
								    must contain "[label]"; the text after the "]" and all following
								    non-blank lines form the entry body handed to refEntry.

								    Raises ValueError (from the unpacking) when the first line of an
								    entry lacks '[' or ']'.
								    """
								    ref = None
								    dd = None       # body of the entry being collected, None between entries

								    w("<dl>\n")
								    for ln in lines[1:]:
								        ln = ln.strip()
								        if ln == '':
								            if dd:
								                refEntry(w, ref, dd)
								                # Forget the flushed entry.  Without this reset, a
								                # second blank line or the flush after the loop
								                # would write the same entry (and id) again.
								                ref = dd = None
								            continue
								        if dd is None:
								            junk, ln = ln.split('[', 1)
								            ref, dd = ln.split(']', 1)
								            dd = dd + "\n"
								        else:
								            dd = dd + ln + "\n"

								    # flush an entry that runs to the end of the section
								    if dd:
								        refEntry(w, ref, dd)

								    w("</dl>\n")
								    w("</div>\n")


								def refEntry(w, ref, dd):
								    r"""write a reference entry

								    w   -- writer callable taking a string
								    ref -- the reference label, without brackets
								    dd  -- the text of the entry

								    When dd contains exactly one double-quoted span, that span is
								    marked up as the title in <cite>.  The title is also made a link
								    when an address can be found: the RFC named by a label that starts
								    with "RFC ", else the first "RFC nnn"/"RFC nnnn" in dd, else the
								    first http/ftp URL in dd.

								    >>> out = []
								    >>> refEntry(out.append, 'IMIP', 'Dawson, F., Mansour, S. and S. Silverberg, "iCalendar Message-based Interoperability Protocol (IMIP)", RFC 2447, November 1998.'); ''.join(out)
								    "<dt id='ref_IMIP'>[IMIP]</dt>\n<dd>Dawson, F., Mansour, S. and S. Silverberg, <cite><a href='http://www.ietf.org/rfc/rfc2447'>iCalendar Message-based Interoperability Protocol (IMIP)</a></cite>, RFC 2447, November 1998.</dd>\n"
								    """
								    w("<dt id='%s'>[%s]</dt>\n" % (asID(ref), ref))
								    w("<dd>")

								    # try to mark up the title
								    parts = dd.split('"')
								    if len(parts) == 3:
								        before, title, after = parts
								        doChars(w, before)

								        w("<cite>")

								        # try to make it a link
								        href = None
								        if ref.startswith("RFC "):
								            href = rfcAddr(ref.split(' ')[1])
								        else:
								            # raw strings keep "\d" from being read as a (deprecated)
								            # string escape; the pattern itself is unchanged
								            m = re.search(r'RFC (\d\d\d\d?)', dd)
								            if m:
								                href = rfcAddr(m.group(1))
								            else:
								                m = re.search(r'((http|ftp)://[^ ,]+)', dd)
								                if m:
								                    href = m.group(1)

								        if href:
								            w("<a href='%s'>" % href)
								            doChars(w, title)
								            w("</a>")
								        else:
								            doChars(w, title)
								        w("</cite>")
								        doChars(w, after)
								    else:
								        doChars(w, dd)

								    w("</dd>\n")


								def rfcAddr(num):
								    """Return the www.ietf.org address of the RFC numbered num."""
								    template = 'http://www.ietf.org/rfc/rfc%s'
								    return template % num


								def asID(ref):
								    """Build the element ID used for a reference label.

								    The label gets a "ref_" prefix and each space becomes "_".

								    >>> asID('VCARD')
								    'ref_VCARD'
								    >>> asID('RFC 1872')
								    'ref_RFC_1872'
								    """
								    words = ref.split(' ')
								    return 'ref_' + '_'.join(words)


								def numSect(w, lines, refs):
								    """Write a numbered section, choosing the layout from its content.

								    w     -- writer callable taking a string
								    lines -- section lines; lines[0] is "<number> <heading text>"
								    refs  -- reference labels for linking citations

								    The heading gets the id "sec<number>".  The body is written by
								    refSect for "References", by flowSectRest for the copyright,
								    acknowledgement and abstract headings, by doStructuredSection when
								    the third line of the section starts with a known label, and
								    otherwise as preformatted text.

								    Raises ValueError (from the unpacking) when lines[0] has no text
								    after the number.
								    """
								    num, head = lines[0].split(None, 1)

								    w("<div><h2 id='sec%s'>%s %s</h2>\n" % (num, num, head))

								    # The section kind is recognized from its third line.  Sections
								    # shorter than that used to raise IndexError; they now fall
								    # through to the preformatted layout.
								    if len(lines) > 2:
								        probe = lines[2]
								    else:
								        probe = ''

								    if head == "References":
								        refSect(w, lines)
								    elif head in ('Full Copyright Statement', 'Acknowledgements',
								                  'Acknowledgments', 'Abstract'):
								        flowSectRest(w, lines, refs)
								    elif probe.startswith("   Property Name:"):
								        doStructuredSection(w, lines, refs, "Property", 'Property')
								    elif probe.startswith(("   Value Name:",
								                           "     Value Name:")): # 4.8.3 indented oddly
								        doStructuredSection(w, lines, refs, "Value")
								    elif probe.startswith("   Component Name:"):
								        doStructuredSection(w, lines, refs, "Component", 'Class')
								    elif probe.startswith("   Parameter Name:"):
								        doStructuredSection(w, lines, refs, "Parameter", 'Property')
								    elif probe.startswith("   To: ietf-mime-directory@imc.org"):
								        doStructuredSection(w, lines, refs, "Type")
								    else:
								        w("<pre>")
								        for l in lines[1:]:
								            bodyText(w, l, refs)
								            w("\n")
								        w("</pre>\n</div>\n")


								def bodyText(w, txt, refs):
								    """ write body text, linking refs

								    w    -- writer callable taking a string
								    txt  -- one line of text
								    refs -- list of reference labels (without brackets)

								    Every "[label]" whose label is in refs becomes a link to the ID
								    given by asID(label).  All other text, including brackets that are
								    not citations, is written through doChars.

								    >>> out = []
								    >>> bodyText(out.append, 'abc [def] ghi', ['def']); ''.join(out)
								    "abc <a href='#ref_def'>[def]</a> ghi"
								    >>> out = []
								    >>> bodyText(out.append, 'a [x] b]', ['def']); ''.join(out)
								    'a [x] b]'
								    """
								    parts = txt.split('[')

								    # text before the first '[' cannot be a citation
								    doChars(w, parts[0])

								    for part in parts[1:]:
								        for ref in refs:
								            if part.startswith(ref + ']'):
								                w("<a href='#%s'>[%s]</a>" % (asID(ref), ref))
								                # drop "label]" by length; unpacking split(']')
								                # failed when more ']' followed on the line
								                part = part[len(ref) + 1:]
								                break
								        else:
								            # not a citation: put back the '[' that split() consumed
								            w('[')
								        doChars(w, part)


								def doChars(w, txt):
								    """Write txt through w with "&" and "<" turned into entities."""
								    # "&" must go first so the "&" of "&lt;" is not escaped again
								    escaped = txt.replace("&", "&amp;")
								    escaped = escaped.replace("<", "&lt;")
								    w(escaped)


								import string

								# ala iana-token = 1*(ALPHA / DIGIT / "-")
								# string.ascii_letters is available on Python 2 and 3 and is always the
								# 52 ASCII letters; string.letters varied with the locale and was removed
								# in Python 3.
								NAMECHARS = string.ascii_letters + string.digits + '-'


								# Subsection labels recognized inside structured sections.
								# doStructuredSection() starts a new subsection whenever the text before
								# a line's first ':' is one of these (or is "<secType> Name").
								# The second group ('To' ... 'Type example') presumably comes from the
								# vCard/MIME-directory registration templates -- TODO confirm.
								Tags = ('Purpose',

								        'Formal Definition',

								        'Value Type',

								        'Property Parameters',

								        'Property Parameter',

								        'Conformance',

								        'Description',

								        'Format Definition',

								        'Example',


								        'To',

								        'Subject',

								        'Type name',

								        'Type purpose',

								        'Type encoding',

								        'Type value',

								        'Type special notes',

								        'Type example'


								        )


								def doStructuredSection(w, lines, refs, secType, rdfClass=None):
								    """Write a labelled section as a definition list and close its div.

								    w        -- writer callable taking a string
								    lines    -- section lines; processing starts at lines[2]
								    refs     -- reference labels, handed on to subSect
								    secType  -- section kind; "<secType> Name" is accepted as a label
								                in addition to the entries of Tags
								    rdfClass -- handed on to subSect unchanged

								    A line whose text before the first ":" (leading blanks removed) is
								    a known label starts a new subsection; the rest of that line and
								    all lines up to the next label form its content.  Lines seen
								    before the first label are written under an empty label.
								    """
								    w("<dl>\n")

								    nameLabel = '%s Name' % secType
								    label = ''
								    body = []

								    for text in lines[2:]:
								        head, colon, tail = text.lstrip().partition(":")
								        if colon and (head == nameLabel or head in Tags):
								            if body:
								                subSect(w, secType, rdfClass, label, body, refs)
								                body = []
								            label = head
								            body.append(tail)
								        else:
								            body.append(text)

								    if body:
								        subSect(w, secType, rdfClass, label, body, refs)

								    w("</dl>\n")
								    w("</div>\n")


								def _subSectTokens(secType, dt):
								    """Return (rel, tokens) for linking tokens in a subsection body.

								    tokens maps the text to look for to the fragment it links to; the
								    result is (None, None) when the subsection gets no token links.
								    """
								    if secType == "Property" and (dt == 'Conformance' or
								                                  dt == 'Description'):
								        return 'applies-to', {'VEVENT': 'Vevent',
								                              'VTODO': 'Vtodo',
								                              'VJOURNAL': 'Vjournal',
								                              'VFREEBUSY': 'Vfreebusy',
								                              'VTIMEZONE': 'Vtimezone',
								                              'VALARM': 'Valarm'
								                              }
								    if dt == 'Description' and secType == "Component":
								        return 'def', {
								            'STANDARD': 'standard',
								            'DAYLIGHT': 'daylight',
								            }
								    if dt == 'Description' and secType == "Value":
								        return 'def', {
								            'FREQ': 'freq',
								            'UNTIL': 'until',
								            'COUNT': 'count',
								            'INTERVAL': 'interval',
								            'BYSECOND': 'bysecond',
								            'BYMINUTE': 'byminute',
								            'BYHOUR': 'byhour',
								            'BYDAY': 'byday',
								            'BYMONTHDAY': 'bymonthday',
								            'BYYEARDAY': 'byyearday',
								            'BYWEEKNO': 'byweekno',
								            'BYMONTH': 'bymonth',
								            'BYSETPOS': 'bysetpos',
								            'WKST': 'wkst',
								            }
								    return None, None


								def subSect(w, secType, rdfClass, dt, dd, refs):
								    """Write one <dt>/<dd> pair of a structured section.

								    w        -- writer callable taking a string
								    secType  -- section kind, e.g. "Property" or "Value"
								    rdfClass -- when true, the name is camel-cased for the dt id, with
								                an initial capital only if it equals 'Class'; when
								                false the id is "<secType>_<name>"
								    dt       -- the subsection label ('' for unlabelled lead-in lines)
								    dd       -- list of content lines; dd[0] is the rest of the label
								                line.  dd[0] is overwritten in place when a quoted
								                name is unquoted.
								    refs     -- reference labels, handed on to bodyText

								    "... Name" / "Type name" subsections give the <dt> an id derived
								    from the name.  "Value Type" subsections link the type name and
								    the DATE, DATE-TIME, PERIOD and BINARY tokens.  Other subsections
								    are written preformatted, with token links chosen by
								    _subSectTokens.
								    """
								    if dt.endswith(" Name") or dt == 'Type name':
								        name = ''.join(dd).strip()

								        # VEVENT is quoted extraneously.  The slice comparison (rather
								        # than name[0]) keeps an empty name from raising IndexError
								        # at this test.
								        if name[:1] == '"':
								            name = name[1:-1]
								            dd[0] = name
								        if name.startswith("Any property name with"):
								            name = "X-"
								        elif rdfClass:
								            name = camelCase(name, rdfClass == 'Class')
								        else:
								            name = secType + "_" + name
								        w("<dt id='%s'>%s</dt>\n" % (name, dt))
								    else:
								        w("<dt>%s</dt>\n" % (dt,))

								    w("<dd class='%s'>" % (asClass(dt),))

								    if dt == 'Value Type':
								        # the type name is the text up to the first '.', if any
								        rest = dd[0]
								        rel = 'value-type'
								        if '.' in rest:
								            name, rest = rest.split('.', 1)
								            name = name.strip()
								            rest = '.' + rest
								        else:
								            name = rest.strip()
								            rest = ''

								        if name.startswith("The default"):
								            # prose such as "The default ... TYPE": the last word is
								            # the type; the words before it are written as-is
								            rel = 'default-value-type'
								            txt = name
								            name = txt.split()[-1]
								            w(txt[:-len(name)])

								        if 'separated' in rest:
								            rel = 'list-of'
								        w("<a rel='%s' href='#Value_%s'>%s</a> <pre>   %s\n"
								          % (rel, name, name, rest))

								        for l in dd[1:]:
								            # tokenRefs writes the linked part and returns the tail
								            l = tokenRefs(w, l, 'allowed-type',
								                          {'DATE': 'Value_DATE',
								                           'DATE-TIME': 'Value_DATE-TIME',
								                           'PERIOD': 'Value_PERIOD',
								                           'BINARY': 'Value_BINARY'})
								            bodyText(w, l, refs)
								            w("\n")

								    else:
								        rel, tokens = _subSectTokens(secType, dt)

								        w("<pre>   ")

								        for l in dd:
								            if tokens: l = tokenRefs(w, l, rel, tokens)
								            bodyText(w, l, refs)
								            w("\n")

								    # both branches leave a <pre> open inside the <dd>
								    w("</pre>\n</dd>\n")


								def asClass(t):
								    """Turn heading tag t into a class name by removing its spaces."""
								    pieces = t.split(" ")
								    return ''.join(pieces)


								def tokenRefs(w, l, rel, tokens):
								    """Write l up to its last known token, turning tokens into links.

								    w      -- writer callable taking a string
								    l      -- one line of text
								    rel    -- value for the rel attribute of each link
								    tokens -- dict mapping token text to the fragment it links to

								    Returns the text after the last token (all of l when none occurs);
								    the caller is expected to write that remainder itself.

								    With rel == 'def' the first link to each target written during
								    this call also carries id="<target>".
								    NOTE: the record of written ids lasts for one call only, so the
								    same id is emitted again when a token recurs in a later call.
								    """
								    if not tokens:
								        # an empty pattern would match everywhere and fail the lookup
								        return l

								    # Longest tokens first, so "DATE-TIME" is not consumed as "DATE";
								    # escaped, so tokens are always matched literally.
								    ordered = sorted(tokens, key=len, reverse=True)
								    pat = re.compile('|'.join([re.escape(t) for t in ordered]))

								    seen = set()
								    while l:
								        m = pat.search(l)
								        if not m: break

								        doChars(w, l[:m.start()])
								        t = m.group(0)
								        target = tokens[t]
								        if rel == 'def' and target not in seen:
								            w('<a id="%s" rel="%s" href="#%s">%s</a>' % (target, rel,
								                                                         target, t))
								            seen.add(target)
								        else:
								            w('<a rel="%s" href="#%s">%s</a>' % (rel, target, t))
								        l = l[m.end():]
								    return l


								def camelCase(n, initialCap=0):
								    """Convert a hyphenated name to camel case.

								    n          -- name with words separated by '-'
								    initialCap -- when true the first word is capitalized as well

								    Every word is lower-cased, words after the first get an upper-case
								    first letter, and the hyphens are dropped.  Empty words (from
								    doubled, leading or trailing hyphens) contribute nothing.
								    """
								    def ucfirst(word):
								        # slicing instead of word[0] tolerates the empty string
								        return word[:1].upper() + word[1:]

								    # a real list, so it can be indexed on Python 3 too (map() returns
								    # an iterator there)
								    words = [word.lower() for word in n.split('-')]

								    if initialCap:
								        return ''.join([ucfirst(word) for word in words])
								    return words[0] + ''.join([ucfirst(word) for word in words[1:]])


								def _test():
								    """Run the doctests found in this module."""
								    from doctest import testmod
								    testmod()


								# Script entry point: "--test" anywhere on the command line runs the
								# doctests; otherwise the RFC on stdin is converted.
								if __name__ == '__main__':
								    if '--test' in sys.argv:
								        _test()
								    else:
								        try:
								            main(sys.argv)
								        except Usage as e:
								            # "except ... as" and write() are valid on Python 2.6+ and
								            # 3.  The Usage docstring is the usage message; the
								            # newline matches what the print statement added.
								            sys.stderr.write(e.__doc__ + "\n")


								# $Log: slurpIcalSpec.py,v $

								# Revision 1.27  2005/11/09 23:10:49  connolly

								# - changed the way duration values are modelled

								#     The iCalendar DURATION value type is actually more than just a

								#     XMLSchema.duration; it also has a RELATED parameter.

								#     So for

								#       TRIGGER;VALUE=DURATION;RELATED=START:-PT15M

								#     we'll write

								#       { ?E cal:trigger [ rdf:value "-PT15M"^^xsdt:duration;

								#                          cal:related "START"] }

								#

								# - fixed test data to have rdf:datatype on integer

								#   values, to match the schema (which matches the RFC)

								#

								# - fixed schema to show DATE-TIME properties (dtstart, ...)

								#   as DatatypeProperties

								#   (there are little/no tests for PERIOD; beware)

								#

								# - scraped more details about property parameters (e.g. partstat, cn,

								#   cutype, ...) and rrule parts (freq, interval, ...) from the RFC so

								#   that they show up as links in the hypertext version and as RDF

								#   properties in the schema.  likewise timezone components (standard,

								#   daylight)

								#  - side effect: added some whitespace in rfc2445.html

								#

								# - demoted x- properties

								#  - removed x- properties from .rdf versions of test data

								#    this allows the round-trip tests to pass

								#  - fromIcal.py doesn't output them unless you give the --x option

								#

								# - added Makefile support for consistency checking with pellet

								#

								# - demoted blank line diagnostic in fromIcal.py to a comment

								#

								# - silenced some left-over debug diagnostics in slurpIcalSpec.py

								#

								# - fixed test/test-created.rdf; added it to fromIcalTest.py list

								#

								# Revision 1.26  2005/07/22 21:14:32  connolly

								# remove : from iCalendar heading

								#

								# Revision 1.25  2005/07/22 21:00:00  connolly

								# - added support for RFC2425, which has

								#  - numbered Abstract and TOC

								#  - examples that start in column 1

								#

								# Revision 1.24  2005/07/22 20:42:12  connolly

								# - handle VCARD structured section tags

								# - working on example extraction; started with CSS style

								# - no bullets on TOC items; just the numbers

								# - handle a ref split across lines in VCARD as a couple typos

								#

								# Revision 1.23  2005/07/22 19:51:28  connolly

								# - parameterize RFC-specific bits so it works for RFC2426 also

								# - factor out typo handling

								# - take RFC number on command line; write diagnostic for incorrect usage

								#

								# Revision 1.22  2005/07/22 19:28:18  connolly

								# - mark up titles in bibliography; make links to RFCs

								# - render copyright, acks sections flowed rather than pre

								# - fix extra . at end of section ID

								#

								# Revision 1.21  2005/07/22 18:49:58  connolly

								# - handle references in 2 passes

								#  - 1st pass to find ref labels in refs section

								#  - 2nd pass to format references from the body and the bibliography

								# - added some unit tests and a --test option

								# - handle unnumbered overview section in TOC

								#

								# Revision 1.20  2004/02/29 14:52:00  connolly

								# new grddl names

								#

								# Revision 1.19  2004/02/12 06:31:23  connolly

								# fix EVENT to VEVENT typo

								#

								# Revision 1.18  2004/02/08 03:30:54  connolly

								# allow for odd indentation of 4.3.8 Integer

								#

								# Revision 1.17  2004/02/08 00:06:03  connolly

								# find domain info in Descriptions of properties as well as Conformance

								#

								# Revision 1.16  2004/02/07 06:30:12  connolly

								# take out broken conformance links to Vcalendar

								#

								# Revision 1.15  2004/02/07 06:02:02  connolly

								# - links from property conformance subsections to components

								#

								# Revision 1.14  2004/02/07 05:31:21  connolly

								# - handle Property Name: Any ... X-

								# - add purposes to formal schema as rdfs:comment

								#

								# Revision 1.13  2004/02/07 05:21:33  connolly

								# - simplify subSect

								# - fix a typo in RFC 2445

								#

								# Revision 1.12  2004/02/07 04:55:50  connolly

								# - refactored doStructuredSection to collect dd lines

								# - use doChars() to fix a bug noted in a comment

								# - removed deblank (dead code)

								# - removed list(lines) (debugging code)

								#

								# Revision 1.11  2004/02/07 04:30:41  connolly

								# - use generators for depagination, section splitting

								# - cite RFC guidelines RFC

								# - factor out some hard-coded strings

								#

								# Revision 1.10  2004/02/07 02:39:10  connolly

								# doStructuredSection was getting out of hand;

								# refactored it before working on Conformance section

								#

								# Revision 1.9  2004/02/07 00:04:37  connolly

								# find more value type info; turn into allowed-type links

								#

								# Revision 1.8  2004/02/01 07:43:16  connolly

								# recognize (though do not fully handle) value types with defaults

								#

								# Revision 1.7  2004/02/01 06:55:11  connolly

								# add provenance in address element

								#

								# Revision 1.6  2004/01/30 01:15:39  connolly

								# fixed case/hyphenation

								#

								# Revision 1.5  2004/01/29 19:40:47  connolly

								# added profile for GRDDL

								#

								# Revision 1.4  2004/01/29 16:49:32  connolly

								# first steps towards gleaning a schema from RFC2445 via XHTML, XSLT

								#

								# Revision 1.3  2004/01/28 10:29:46  connolly

								# handle (some cases) of prose in the Value Type field

								# handle more section types

								#

								# Revision 1.2  2004/01/28 10:02:03  connolly

								# groks quite a bit more structure

								#

								# Revision 1.1  2004/01/28 08:54:24  connolly

								# produces pretty reasonable XHTML

								#