Tuesday, 27 November 2012

WebVTT Parser Should Use the Parser Specification

Do Not Use Syntax Specification For Parser

From what I've been reading in the unit tests, it seems that the parser was written with the syntax rules of the WebVTT specification in mind. It should not have been. It must be written to the parsing rules of the specification. I've only managed to review the generic cue setting tests and the align setting tests. I've had to change 10 of 25 tests, all of which were throwing errors and shouldn't. I don't really mind that. 5 and 3 of the remaining tests are testing the same thing. Tests are good. What I mind is that I had to rewrite all the comments and documentation because they were all referencing the syntax rules and not the parsing rules.

If you don't want to read the parsing specification, that's fine. I wrote it all out in pseudo-code. Let me re-post it. Obviously much of it needs to be expanded to actually work, and most of that is fine, especially the string handling parts. But otherwise it should work exactly this way. I would expect that a number of the current bugs could be solved just by making the parser conform to the parsing specification.

You may want to change the Bad Cue Loop. I put it where it seems to be right according to the spec, but there may be a more optimal place for it. It should only run if the cue timings are malformed, since that is the only place where an error can be thrown.

We are actually coding in C with the addition of a C++ wrapper.

I edited it a bit to make it look a bit better and fill in some parts.

Parser Pseudo-Code


// Represents a dynamically updating list of text track cues.
// Cues can be read by numeric index (the anonymous getter) or looked up by
// their identifier string; getCueById is nullable, so a lookup may yield no cue.
interface TextTrackCueList {
  readonly attribute unsigned long length;   // Number of cues
  getter TextTrackCue (unsigned long index);
  TextTrackCue? getCueById(DOMString id);    // By identifier
};

// Single-value keyword type; it is the non-numeric alternative accepted by TextTrackCue.line.
enum AutoKeyword { "auto" };

// A single cue: a time range together with its text and display settings.
// The constructor takes the start time, the end time and the cue text.
[Constructor(double startTime, double endTime, DOMString text)]
interface TextTrackCue : EventTarget {
  // Track this cue is attached to; nullable, so a cue may exist without one.
  readonly attribute TextTrack? track;

           attribute DOMString id;              // Identifier
           attribute double startTime;
           attribute double endTime;
           attribute boolean pauseOnExit;
           attribute DOMString vertical;
           attribute boolean snapToLines;
           // Either a number or the keyword "auto" (see AutoKeyword).
           attribute (long or AutoKeyword) line;
           attribute long position;
           attribute long size;
           attribute DOMString align;
           attribute DOMString text;
  // Returns the cue text as a document fragment.
  DocumentFragment getCueAsHTML();

           // Event handlers for the cue's "enter" and "exit" events.
           attribute EventHandler onenter;
           attribute EventHandler onexit;
};

// Node hierarchy produced by parseCueText.
// Every node is either an InternalNode (may contain children and carries the
// class names taken from its start tag) or a LeafNode (no children).

// Common base type of every cue-text node.
interface Node {}

// A node that can contain other nodes.
interface InternalNode : Node {
    OrderedList<Node> children;        // Child nodes, in document order
    OrderedList<String> classNames;    // Class names attached to the start tag
}

// A node that never has children.
interface LeafNode : Node {}

// One InternalNode subtype per cue-text tag handled by parseCueText:
// <c>, <i>, <b>, <u>, <ruby>, <rt> and <v>.
interface ClassNode : InternalNode {}

interface ItalicsNode : InternalNode {}

interface BoldNode : InternalNode {}

interface UnderlineNode : InternalNode {}

interface RubyNode : InternalNode {}

interface RubyTextNode : InternalNode {}

interface VoiceNode : InternalNode {
    attribute String voiceName;
}

// Plain text run.
interface TextNode : LeafNode {
    attribute String text;
}

// In-text timestamp, expressed in seconds.
interface TimestampNode : LeafNode {
    attribute double timestampSeconds;
}

// Token types returned by cueTextTokenizer and consumed by parseCueText.

// Common base type of every token.
interface Token {}

// A run of plain text (character references already replaced).
interface StringToken : Token {
    attribute String value;
}

// An opening tag such as <c.loud> or <v Fred>.
interface StartTagToken : Token {
    attribute String tagName;
    attribute OrderedList<String> classes; //Could be done like TextTrackCueList?
    attribute String annotation;
}

// A closing tag such as </c>.
interface EndTagToken : Token {
    attribute String tagName;
}

// A timestamp tag such as <00:01:02.000>.
// NOTE(review): cueTextTokenizer constructs this token with the raw tag text
// (tagName: result) rather than a parsed double - confirm which form is intended.
interface TimestampTagToken : Token {
    attribute double value;
}

# Top-level WebVTT file parser.
#
# Parameters:
#   byteStreamInput - the (possibly still arriving) bytes of the WebVTT file
#   output          - list that receives one TextTrackCue per successfully parsed cue
#
# Throws Error when the first line is not a valid "WEBVTT" signature line.
# Malformed cues never abort the parse; they are skipped by the bad cue loop.
#
# position is updated in place by advancePosition and collectLine.
Method parse (ByteStream byteStreamInput, OrderedList<TextTrackCue> output)
   String input = convert asynchronous byteStreamInput to Unicode

   # Input normalisation: after this step LINE FEED is the only line terminator
   replace NULL characters with REPLACEMENT CHARACTER characters
   replace CARRIAGE RETURN LINE FEED (CRLF) character pairs with single LINE FEED
   replace CARRIAGE RETURN characters with LINE FEED characters

   Integer position = start of input

   If character at position in input is BYTE ORDER MARK
      advancePosition(input, position)
   End If

   String line
   # True when `line` already holds a line containing "-->" that the cue loop must reuse
   Boolean alreadyCollectedLine = False

   # Signature line: "WEBVTT", optionally followed by SPACE or TAB and anything else
   line = collectLine(input, position)

   If line has less than 6 characters
      Throw Error
   End If

   If line has exactly 6 characters and is not "WEBVTT"
      Throw Error
   End If

   If line has more than 6 characters
   and (first 6 characters not "WEBVTT" or (7th character not SPACE or TAB))
      Throw Error
   End If

   If position is past end of input
      return
   End If

   If character at position in input is LINE FEED
      advancePosition(input, position)
   End If

   # Header: skip every header line up to and including the first empty line.
   # A line containing "-->" means the first cue started without a blank separator.
   Do
      line = collectLine(input, position)

      If position is past end of input
         return
      End If

      If character at position in input is LINE FEED
         advancePosition(input, position)
      End If

      If line contains "-->"
         alreadyCollectedLine = True
         Exit Do Loop
      End If
   While line is not empty

   # Cue Loop
   Loop
      If alreadyCollectedLine is False
         # Skip the blank lines between cues
         While character in input at position is LINE FEED
            advancePosition(input, position)
         End While

         line = collectLine(input, position)

         # Blank lines were skipped above, so an empty line here means end of input
         If line is empty
            Exit Loop
         End If
      End If

      TextTrackCue cue = new TextTrackCue()

      # Default cue settings
      cue.identifier = empty
      cue.pauseOnExit = False
      cue.writingDirection = horizontal
      cue.snapToLines = True
      cue.linePosition = auto
      cue.textPosition = 50
      cue.size = 100
      cue.alignment = middleAlignment
      cue.text = empty

      # A first line without "-->" is the cue identifier; the timings follow on the next line
      If line does not contain "-->"
         cue.identifier = line

         If position is past end of input
            Exit Loop
         End If

         If character at position in input is LINE FEED
            advancePosition(input, position)
         End If

         line = collectLine(input, position)

         # Identifier followed by a blank line: discard this cue and look for the next one
         If line is empty
            Continue Loop
         End If
      End If

      alreadyCollectedLine = False

      Try
         collectCueTimingsAndSettings(line, cue)
      Catch
         Boolean end = False

         # Bad cue loop: the timings were malformed, so discard every line of this
         # cue up to the next blank line or the next line containing "-->"
         Loop
            If position is past end of input
               end = true
               Exit Loop
            End If

            If character at position in input is LINE FEED
               advancePosition(input, position)
            End If

            line = collectLine(input, position)

            If line contains "-->"
               alreadyCollectedLine = True
               Exit Loop
            End If

            If line is empty
               Exit Loop
            End If
         End Loop

         If end is true
            Exit Loop
         End If

         Continue Loop
      End Try

      String cueText = empty

      # Cue text loop: gather lines until a blank line, the end of input,
      # or a line containing "-->" (which starts the next cue)
      Loop
         If position is past end of input
            Exit Loop
         End If

         If character at position in input is LINE FEED
            advancePosition(input, position)
         End If

         line = collectLine(input, position)

         If line is empty
            Exit Loop
         End If

         If line contains "-->"
            alreadyCollectedLine = True
            Exit Loop
         End If

         # Lines are joined with a single LINE FEED, with no trailing terminator
         If cueText is not empty
            cueText += LINE FEED
         End If

         cueText += line
      End Loop

      # Cue text processing
      cue.text = cueTextDomContruction(parseCueText(cueText))

      output append cue
   End Loop
End Method parse

# Moves position forward by one character of input.
#
# position is an in/out parameter: callers test it again after the call.
# Because input is fed from an asynchronous byte stream, the method waits for
# more data when it stands on the last character received so far.
Method advancePosition(String input, Integer position)
   If position is at the end of input and bytestream has not ended
      Wait for bytestream to add characters to input
   End If

   If bytestream has ended and next position is past end of input
      position = past end of input
   Else
      position = location of next character in input
   End If
End Method advancePosition

# Collects the characters from position up to, but not including, the next
# LINE FEED (or up to the end of input when there is none) and returns them.
# The LINE FEED itself is not consumed: position is left on it, so the caller
# has to step over it before collecting the following line.
# Returns the empty string when position already stands on a LINE FEED or is
# past the end of input.
Function String collectLine(String input, Integer position)
   String result = empty
   
   While position not past end of input and character in input at position not LINE FEED
      result += character in input at position
      # advancePosition updates position in place
      advancePosition(input, position)
   End While
   
   return result
End Function collectLine

# Parses one cue timing line: start timestamp, "-->", end timestamp, settings.
#
# Parameters:
#   input - the timing line, without its line terminator
#   cue   - receives startTime, endTime and any recognised settings
#
# Throws Error when either timestamp is malformed or the "-->" separator is
# missing; the caller treats that as a bad cue.
Method collectCueTimingsAndSettings(String input, TextTrackCue cue)
   String remainder
   Integer position

   position = start of input

   skipWhitespace(input, position)

   cue.startTime = collectTimestamp(input, position)

   skipWhitespace(input, position)

   # The separator must be exactly "-->"; each character is checked in turn and
   # running off the end of the line counts as a missing separator
   If position is past end of input or character at position in input is not "-"
      Throw Error
   Else
      position = location of next character in input
   End If

   If position is past end of input or character at position in input is not "-"
      Throw Error
   Else
      position = location of next character in input
   End If

   If position is past end of input or character at position in input is not ">"
      Throw Error
   Else
      position = location of next character in input
   End If

   skipWhitespace(input, position)

   cue.endTime = collectTimestamp(input, position)

   # Everything after the end timestamp is the settings list
   remainder = remainder of input starting at position

   parseSettings(remainder, cue)
End Method collectCueTimingsAndSettings

# Defined in http://dev.w3.org/html5/spec/common-microsyntaxes.html#common-parser-idioms
# Advances position past any run of space characters; position is updated in place.
# Stops at the end of input so that a line ending in whitespace does not read out of bounds.
Method skipWhitespace(String input, Integer position)
   While position not past end of input
   and character in input at position is SPACE or TAB or LINE FEED or FORM FEED or CARRIAGE RETURN
      position = location of next character in input
   End While
End Method skipWhitespace

# Applies the cue settings found after the end timestamp of a timing line.
#
# Parameters:
#   input - the part of the timing line that follows the end timestamp
#   cue   - cue whose settings are updated
#
# Settings have the form name:value and are separated by SPACE or TAB.
# Unknown names and invalid values are ignored silently: the cue keeps whatever
# value it had before. This method never throws.
Method parseSettings(String input, TextTrackCue cue)
   OrderedList<String> settings = input split on SPACE and TAB

   For Each String setting in settings
      # A setting needs a non-empty name and a non-empty value
      If setting does not contain ":" or first or last character in setting is ":"
         Next setting
      End If

      String name = substring of setting between start of setting and first ":"

      String value = substring of setting between first ":" and end of setting

      # Inside the Switch, Break abandons the current setting and moves on to the next one
      Switch (name)
         Case "vertical"
            If value is "rl"
               cue.writingDirection = verticalGrowingLeft
            End If

            If value is "lr"
               cue.writingDirection = verticalGrowingRight
            End If

            Break

         Case "line"
            # Accepted forms: a signed line number ("-1", "5") or a percentage ("50%")
            If value contains characters other than "-", "%", or "0" through "9"
               Break
            End If

            If value does not contain at least one character between "0" through "9"
               Break
            End If

            If any character in value other than the first is "-"
               Break
            End If

            If any character in value other than the last is "%"
               Break
            End If

            Integer number = parse substring of value excluding trailing "%" as a signed integer

            If last character in value is "%" and (number < 0 or number > 100)
               Break
            End If

            cue.linePosition = number

            # A percentage positions the cue freely; a plain number snaps it to a line
            If last character in value is "%"
               cue.snapToLines = False
            Else
               cue.snapToLines = True
            End If

            Break

         Case "position"
            # Must be a percentage between 0% and 100%
            If value contains characters other than "%" or "0" through "9"
               Break
            End If

            If value does not contain at least one character between "0" through "9"
               Break
            End If

            If any character in value other than the last is "%"
               Break
            End If

            If last character in value is not "%"
               Break
            End If

            Integer number = parse substring of value excluding trailing "%" as a signed integer

            If number < 0 or number > 100
               Break
            End If

            cue.textPosition = number

            Break

         Case "size"
            # Must be a percentage between 0% and 100%
            If value contains characters other than "%" or "0" through "9"
               Break
            End If

            If value does not contain at least one character between "0" through "9"
               Break
            End If

            If any character in value other than the last is "%"
               Break
            End If

            If last character in value is not "%"
               Break
            End If

            Integer number = parse substring of value excluding trailing "%" as a signed integer

            If number < 0 or number > 100
               Break
            End If

            cue.size = number

            Break

         Case "align"
            If value is "start"
               cue.alignment = startAlignment
            End If

            If value is "middle"
               cue.alignment = middleAlignment
            End If

            If value is "end"
               cue.alignment = endAlignment
            End If

            If value is "left"
               cue.alignment = leftAlignment
            End If

            If value is "right"
               cue.alignment = rightAlignment
            End If

            Break
      End Switch (name)
   End For Each setting
End Method parseSettings

# Parses a timestamp of the form mm:ss.ttt or h...:mm:ss.ttt starting at position.
#
# Parameters:
#   input    - string holding the timestamp
#   position - in/out; left on the first character after the timestamp
#
# Returns the timestamp in seconds, including the fractional milliseconds.
# Throws Error on any malformed timestamp.
Function Float collectTimestamp(String input, Integer position)
   Enumerable SignificantUnits
      Minutes
      Hours
   End Enumerable

   Integer value1, value2, value3, value4
   String string
   SignificantUnits mostSignificantUnits

   mostSignificantUnits = Minutes

   If position is past end of input
      Throw Error
   End If

   If character at position in input is not "0" through "9"
      Throw Error
   End If

   string = collectDigits(input, position)

   value1 = parse string to integer

   # A first field that is not two digits long, or that exceeds 59, can only be an hours field
   If string not exactly two characters or value1 > 59 then
      mostSignificantUnits = Hours
   End If

   If position is past end of input or character in input at position is not ":"
      Throw Error
   Else
      position = location of next character in input
   End If

   string = collectDigits(input, position)

   If string not exactly two characters
      Throw Error
   End If

   value2 = parse string to integer

   # A third field is required when hours are present and accepted whenever a second ":" follows
   If mostSignificantUnits = Hours
   or (position not past end of input and character at position in input is ":")
      If position is past end of input or character in input at position is not ":"
         Throw Error
      Else
         position = location of next character in input
      End If

      string = collectDigits(input, position)

      If string not exactly two characters
         Throw Error
      End If

      value3 = parse string to integer
   Else
      # Only mm:ss was given: shift the fields so value1/value2/value3 are always h/m/s
      value3 = value2
      value2 = value1
      value1 = 0
   End If

   If position is past end of input or character in input at position is not "."
      Throw Error
   Else
      position = location of next character in input
   End If

   string = collectDigits(input, position)

   If string not exactly three characters
      Throw Error
   End If

   value4 = parse string to integer

   If value2 > 59 or value3 > 59
      Throw Error
   End If

   # Divide by 1000.0 so the milliseconds are not lost to integer division
   return value1 * 60 * 60 + value2 * 60 + value3 + value4 / 1000.0
End Function collectTimestamp

# Collects the run of characters "0" through "9" that starts at position and
# returns it as a string (empty when position is not on a digit).
# position is in/out and is left on the first character that is not a digit.
Function String collectDigits(String input, Integer position)
   String result = empty

   While position not past end of input and character in input at position is "0" through "9"
      result += character in input at position
      position = location of next character in input
   End While

   return result
End Function collectDigits

# Turns the raw text of one cue into a tree of nodes.
#
# Parameters:
#   input - the cue text, lines joined with LINE FEED
#
# Returns the top-level nodes in document order. Nested tags appear as children
# of the InternalNode created for their start tag.
# Unknown tags, mismatched end tags and malformed timestamp tags are ignored.
Function OrderedList<Node> parseCueText (String input)
   Integer position = start of input
   # Synthetic container: its children are the list handed back to the caller
   InternalNode root = new InternalNode
   # Node that currently receives new children
   InternalNode current = root

   Loop
      If position is past End of input
         return root.children
      End If

      # cueTextTokenizer advances position past the token it returns
      Token token = cueTextTokenizer(input, position)

      Switch (typeof(token))
         Case StringToken
            current.children append new TextNode(text: token.value)
            Break

         Case StartTagToken
            # Stays null for tag names that are not recognised (or not allowed here)
            InternalNode node = null

            Switch (token.tagName)
               Case "c"
                  node = new ClassNode()
                  Break

               Case "i"
                  node = new ItalicsNode()
                  Break

               Case "b"
                  node = new BoldNode()
                  Break

               Case "u"
                  node = new UnderlineNode()
                  Break

               Case "ruby"
                  node = new RubyNode()
                  Break

               Case "rt"
                  # Ruby text is only meaningful directly inside a ruby element
                  If typeof(current) is RubyNode
                     node = new RubyTextNode()
                  End If
                  Break

               Case "v"
                  VoiceNode voice = new VoiceNode()

                  # The annotation of a <v> tag is the name of the voice
                  If token.annotation is not null
                     voice.voiceName = token.annotation
                  else
                     voice.voiceName = empty
                  End If

                  node = voice
                  Break
            End Switch (token.tagName)

            If node is not null
               appendClassesToNode(node, token)
               current.children append node
               # Content that follows belongs to the new node until its end tag
               current = node
            End If

            Break

         Case EndTagToken
            # An end tag only closes the innermost open node, and only if it matches
            If (token.tagName is "c" And typeof(current) is ClassNode)
            Or (token.tagName is "i" And typeof(current) is ItalicsNode)
            Or (token.tagName is "b" And typeof(current) is BoldNode)
            Or (token.tagName is "u" And typeof(current) is UnderlineNode)
            Or (token.tagName is "ruby" And typeof(current) is RubyNode)
            Or (token.tagName is "rt" And typeof(current) is RubyTextNode)
            Or (token.tagName is "v" And typeof(current) is VoiceNode)
               current = parent of current
            else If token.tagName is "ruby" And typeof(current) is RubyTextNode
               # </ruby> implicitly closes an open <rt> as well
               current = parent of parent of current
            End If

            Break

         Case TimestampTagToken
            # The tokenizer hands over the raw tag text; it is a timestamp only if
            # it parses completely, with no trailing characters
            Integer timestampPosition = start of token.tagName

            Try
               Float seconds = collectTimestamp(token.tagName, timestampPosition)

               If timestampPosition is past end of token.tagName
                  current.children append new TimestampNode(timestampSeconds: seconds)
               End If
            Catch
               # Malformed timestamp tag: ignore it
            End Try

            Break
      End Switch (token)
   End loop
End Function parseCueText

# Copies the class names of a start tag token onto the node created for it.
# Empty class names (produced, for example, by two consecutive "." characters
# in the tag) are skipped.
Method appendClassesToNode(InternalNode node, Token token)
   for each className in token.classes
      If className not empty
         # InternalNode declares its class list as classNames
         node.classNames append className
      End If
   End for
End Method appendClassesToNode

# Reads one token of cue text starting at position.
#
# Parameters:
#   input    - the cue text
#   position - in/out; on return it stands on the first character of the next token
#
# Returns a StringToken, StartTagToken, EndTagToken or TimestampTagToken.
#
# The tokenizer is a state machine that examines one character per iteration.
# A Case ">" without a Break deliberately falls through into the end-of-file
# case below it: the ">" is consumed and the finished tag token is returned.
Function Token cueTextTokenizer(String input, Integer position)
   Enumerable TokenizerStates
      dataState
      escapeState
      tagState
      startTagState
      startTagClassState
      startTagAnnotationState
      EndTagState
      timestampTagState
   End Enumerable

   TokenizerStates tokenizerState = dataState
   # Text of a string token, or the tag name of a tag token
   String result = empty
   # Scratch space: pending character reference, class name or annotation
   String buffer = empty
   OrderedList<String> classes = empty
   Character c

   loop
      If position is past End of input
         c = End of file marker
      else
         c = character in input indicated by position
      End If

      Switch (tokenizerState)
         Case dataState
            Switch (c)
               Case "&"
                  buffer = c
                  tokenizerState = escapeState
                  Break

               Case "<"
                  If result is empty
                     tokenizerState = tagState
                  else
                     # Return the text collected so far; position stays on "<"
                     # so that the next call starts with the tag
                     return new StringToken(value: result)
                  End If
                  Break

               Case End-OF-FILE MARKER
                  return new StringToken(value: result)
                  Break

               default
                  result += c
            End Switch (c)

            Break

         Case escapeState
            Switch (c)
               Case "&"
                  # The previous "&..." was not a reference; keep it as text and start again
                  result += buffer
                  buffer = c
                  Break

               Case "0" to "9"
               Case "a" to "z"
               Case "A" to "Z"
                  buffer += c
                  Break

               Case ";"
                  Switch (buffer)
                     Case "&amp"
                        result += "&"
                        Break

                     Case "&lt"
                        result += "<"
                        Break

                     Case "&gt"
                        result += ">"
                        Break

                     Case "&lrm"
                        result += LEFT-TO-RIGHT MARK
                        Break

                     Case "&rlm"
                        result += RIGHT-TO-LEFT MARK
                        Break

                     Case "&nbsp"
                        result += NO-Break SPACE
                        Break

                     default
                        # Unknown reference: keep it verbatim
                        result += buffer + ";"
                  End Switch (buffer)

                  tokenizerState = dataState
                  Break

               Case "<"
               Case End-OF-FILE MARKER
                  result += buffer
                  return new StringToken(value: result)
                  Break

               default
                  result += buffer
                  result += c
                  tokenizerState = dataState
            End Switch (c)

            Break

         Case tagState
            Switch (c)
               Case TAB
               Case LINE FEED
               Case FORM FEED
               Case SPACE
                  tokenizerState = startTagAnnotationState
                  Break

               Case "."
                  tokenizerState = startTagClassState
                  Break

               Case "/"
                  tokenizerState = EndTagState
                  Break

               Case "0" to "9"
                  result = c
                  tokenizerState = timestampTagState
                  Break

               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  return new StartTagToken(tagName: empty)
                  Break

               default
                  result = c
                  tokenizerState = startTagState
            End Switch (c)

            Break

         Case startTagState
            Switch (c)
               Case TAB
               Case FORM FEED
               Case SPACE
                  tokenizerState = startTagAnnotationState
                  Break

               Case LINE FEED
                  # A line break after the tag name is kept as part of the annotation
                  buffer = c
                  tokenizerState = startTagAnnotationState
                  Break

               Case "."
                  tokenizerState = startTagClassState
                  Break

               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  return new StartTagToken(tagName: result)
                  Break

               default
                  result += c
            End Switch (c)

            Break

         Case startTagClassState
            Switch (c)
               Case TAB
               Case FORM FEED
               Case SPACE
                  classes append buffer
                  buffer = empty
                  tokenizerState = startTagAnnotationState
                  Break

               Case LINE FEED
                  # A line break after the class list is kept as part of the annotation
                  classes append buffer
                  buffer = c
                  tokenizerState = startTagAnnotationState
                  Break

               Case "."
                  classes append buffer
                  buffer = empty
                  Break

               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  classes append buffer
                  return new StartTagToken(tagName: result, classes: classes)
                  Break

               default
                  buffer += c
            End Switch (c)

            Break

         Case startTagAnnotationState
            Switch (c)
               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  remove leading and trailing space characters from buffer
                  replace sequences of one or more consecutive space characters in buffer with a single SPACE
                  return new StartTagToken(tagName: result, classes: classes, annotation: buffer)
                  Break

               default
                  buffer += c
            End Switch (c)

            Break

         Case EndTagState
            Switch (c)
               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  return new EndTagToken(tagName: result)
                  Break

               default
                  result += c
            End Switch (c)

            Break

         Case timestampTagState
            Switch (c)
               Case ">"
                  position = location of next character in input
                  # falls through

               Case End-OF-FILE MARKER
                  # The raw tag text is returned unparsed
                  return new TimestampTagToken(tagName: result)
                  Break

               default
                  result += c
            End Switch (c)

            Break
      End Switch (tokenizerState)

      position = location of next character in input
   End loop
End Function cueTextTokenizer

Function Tree cueTextDomContruction(OrderedList<Node> nodes)
   # Placeholder with no body yet: parse passes it the node list returned by
   # parseCueText and stores the result in cue.text.
   # The author was unsure how to express this step at this point.
   # Refer to http://dev.w3.org/html5/webvtt/#webvtt-cue-text-dom-construction-rules
End Function cueTextDomContruction

No comments:

Post a Comment