#
# $Id: rfc822parser.icn,v 1.1 2004/02/12 17:07:56 rparlett Exp $
#

package mail

link scan, caseless

import util

global ctl_char, space, specials, atom_char, ctext_char, field_name_chars,
       qtext_char, dtext_char, lwsp_char, days, months, tz_chars, tspecials, atom_char_1521

#
# Initialize the parser
# @p
procedure init_parser()
   ctl_char := &ascii[1:33] ++ &ascii[128]
   space := ' '
   specials := '()<>@,;:\\\".[]'
   atom_char := &ascii -- ctl_char -- space -- specials
   qtext_char := &ascii -- '\"\\\r'
   dtext_char := &ascii -- '[]\\\r'
   ctext_char := &ascii -- '()\\\r'
   field_name_chars := &ascii -- ctl_char -- space -- ':'
   lwsp_char := ' \t'
   days := ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
   months := ["Jan", "Feb",  "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
   tz_chars := '+-' ++ &digits ++ &ucase
   # RCF 1521 defs
   tspecials := '()<>@,;:\\\"/[]?='
   atom_char_1521 := &ascii -- ctl_char -- space -- tspecials
end

class RFC822Parser : Error()
   #
   # The first four parse objects are the fundamental lexical components, other
   # than inidividual special characters (see rfc822).  Therefore, they must
   # be preceded by a call to next_token, to move the position to the correct
   # point to parse them.
   #

   method parse_error(s)
      return error("Error parsing '" || &subject || "': " || s)
   end

   #
   # qtext = <any CHAR excepting <">, "\" & CR, and including 
   #         linear-white-space>
   # quoted-pair =  "\" CHAR
   # quoted-string = <"> *(qtext/quoted-pair) <">
   #
   # Must be preceded by a call to next_token()
   #
   method parse_quoted_string()
      local res

      res := move(1)
      repeat {
         if any('\\') then {
            res ||:= move(2) | return parse_error("Unterminated quoted string")
         } else if any(qtext_char) then
            res ||:= tab(many(qtext_char))
         else if any('\"') then {
            return res || move(1)
         } else
            # Missing closing quote or illegal char
            return parse_error("Missing closing quote or illegal char")
      }
   end

   #
   # atom        =  1*<any CHAR except specials, SPACE and CTLs>
   # ie one or more atom_char
   #
   # Must be preceded by a call to next_token()
   #
   method parse_atom()
      return tab(many(atom_char))
   end

   #
   # domain-literal =  "[" *(dtext / quoted-pair) "]"
   # dtext =  <any CHAR excluding "[","]", "\" & CR>
   #
   # Must be preceded by a call to next_token()
   #
   method parse_domain_literal()
      local res
      res := move(1)
      repeat {
         if any('\\') then
            res ||:= move(2) | return parse_error("Premature end of field")
         else if any(dtext_char) then
            res ||:= tab(many(dtext_char))
         else if any(']') then 
            return res || move(1)
         else
            # Missing closing ] or illegal char
            return parse_error("Missing closing ] or illegal char")
      }
   end

   #
   # comment     =  "(" *(ctext / quoted-pair / comment) ")"
   #
   method parse_comment()
      local res
      res := move(1)
      repeat {
         if any('\\') then
            res ||:= move(2) | return parse_error("Premature end of field")
         else if any(ctext_char) then
            res ||:= tab(many(ctext_char))
         else if any('(') then
            res ||:= parse_comment() | fail
         else if any(')') then 
            return res || move(1)
         else
            # Missing closing ) or illegal char
            return parse_error("Missing closing ) or illegal char")
      }
   end

   method next_token()
      repeat {
         tab(many(lwsp_char))
         if any('(') then
            parse_comment() | fail
         else
            return
      }
   end

   #
   # word =  atom / quoted-string
   #
   method parse_word() 
      next_token() | fail
      if any('\"') then
         return parse_quoted_string()
      else
         return parse_atom()
   end

   #
   # local-part  =  word *("." word)             
   #
   method parse_local_part()
      local res

      res := parse_word() | fail
      
      while res ||:= ="." do
         res ||:= parse_word() | fail

      return res
   end

   #
   # domain      =  sub-domain *("." sub-domain)
   #
   method parse_domain()
      local res

      res := parse_sub_domain() | fail

      while res ||:= ="." do
         res ||:= parse_sub_domain() | fail

      return res
   end

   #
   # sub-domain  =  domain-ref / domain-literal
   # domain-ref  =  atom
   #
   method parse_sub_domain()
      next_token() | fail
      if any('[') then
         return parse_domain_literal()
      else
         return parse_atom()
   end

   #
   # phrase =  1*word
   # 
   # In fact this is parsed as just "*word".  This allows
   # parsing of mailboxes such as, for example "<rparlett@xyz.com>" and
   # groups such as :joe@soap.com;
   #
   method parse_phrase()
      local res
      res := ""
      next_token() | fail
      while any('\"' | atom_char) do {
         # Single spaces between words
         if *res > 0 then
            res ||:= " "
         res ||:= parse_word() | fail
         next_token() | fail
      }

      # Strictly, this would be :-
      # if *res > 0 then
      #   return res

      return res
   end

   #
   # addr_spec = local-part "@" domain
   #
   method parse_addr_spec(mb)
      mb.local_part := parse_local_part() | fail
      next_token() | fail
      ="@" | return parse_error("@ expected")
      mb.domain := parse_domain() | fail
      return
   end

   #
   # route       =  1#("@" domain) ":" 
   # where 1#X means '(X *("," X))' 
   #
   method parse_route(mb)
      mb.route := []
      repeat {
         next_token() | fail
         ="@" | return parse_error("@ expected")
         put(mb.route, parse_domain()) | fail
         next_token() | fail
         ="," | break
         next_token() | fail
      }
      next_token() | fail
      =":" | return parse_error(": expected")
      return
   end

   #
   # route-addr  =  "<" [route] addr-spec ">"
   #
   method parse_route_addr(mb)
      ="<" | return parse_error("< expected")
      next_token() | fail
      if any('@') then
         parse_route(mb) | fail
      else
         mb.route := []
      next_token() | fail
      parse_addr_spec(mb) | fail
      next_token() | fail
      =">" | return parse_error("> expected")
      return
   end

   #
   # mailbox = addr-spec | phrase route-addr
   #
   method parse_mailbox_impl(mb)
      /mb := Mailbox()
      #
      # The lookahead is complex, so try parsing as a simple address
      # first, and if that fails try the more complex alternative.
      #
      x := &pos
      if parse_addr_spec(mb) then {
         mb.phrase := ""
         mb.route := []
      } else {
         tab(x)
         mb.phrase := parse_phrase() | fail
         parse_route_addr(mb) | fail
      }

      return mb
   end

   #
   # group =  phrase ":" [#mailbox] ";"
   # #X means empty or X,X,X...
   #    
   method parse_group_impl(group)
      local mb

      /group := Group()
      group.phrase := parse_phrase() | fail
      group.mailboxes := []
      next_token() | fail
      =":" | return parse_error(": expected")

      next_token() | fail
      if any(';') then # empty group
         move(1)
      else {
         repeat {
            mb := parse_mailbox_impl() | fail
            put(group.mailboxes, mb)
            next_token() | fail
            ="," | break
         }
         =";" | return parse_error("; expected")
      }

      return group
   end

   method parse_mailbox_or_group()
      local x, a

      x := &pos
      if a := parse_mailbox_impl() then
         return a
      tab(x)

      return parse_group_impl()
   end

   method parse_address_list_impl(can_be_empty)
      local l
      l := []

      if \can_be_empty then {
         next_token() | fail
         if pos(0) then
            return l
      }

      repeat {
         put(l, parse_mailbox_or_group()) | fail
         next_token() | fail
         ="," | return l
      }
   end

   method parse_mailbox_list_impl(can_be_empty)
      local l
      l := []

      if \can_be_empty then {
         next_token() | fail
         if pos(0) then
            return l
      }

      repeat {
         put(l, parse_mailbox_impl()) | fail
         next_token() | fail
         ="," | return l
      }
   end

   #
   # month =  "Jan"  /  "Feb" /  "Mar"  /  "Apr" /  "May"  /  "Jun" /  
   #         "Jul"  /  "Aug" /  "Sep"  /  "Oct" /  "Nov"  /  "Dec"
   #
   method parse_month()
      local i
      next_token() | fail
      every i := 1 to *months do
         if =months[i] then
            return i
   end

   method parse_day()
      local s
      next_token() | fail
      s := tab(many(&digits)) | return parse_error("digit expected")
      if *s <= 2 then
         return integer(s)
      return parse_error("invalid day: " || s)
   end

   method parse_year()
      local s
      next_token() | fail
      s := tab(many(&digits)) | return parse_error("digit expected")
      if *s = (2 | 4) then
         return integer(s)
      return parse_error("invalid year: " || s)
   end

   method parse_2dig()
      local s
      next_token() | fail
      s := tab(many(&digits)) | return parse_error("digit expected")
      if *s = 2 then
         return integer(s)
      return parse_error("2 digit field expected")
   end

   method parse_zone()
      next_token() | fail
      tab(upto(tz_chars)) | return parse_error("Expected tz char")
      return tab(many(tz_chars)) | return parse_error("Expected tz char")
   end

   #
   # date-time = [ day "," ] date time
   # day =  "Mon"  / "Tue" /  "Wed"  / "Thu"/  "Fri"  / "Sat" /  "Sun"
   # date =  1*2DIGIT month 2DIGIT        
   # time =  hour zone                    
   # hour =  2DIGIT ":" 2DIGIT [":" 2DIGIT]
   #
   method parse_date_time_impl()
      local d, m, y, h, s, t

      t := Time()

      next_token() | fail
      if =(!days) then {
         next_token() | fail
         ="," | return parse_error(", expected")
      }
      d := parse_day() | fail
      m := parse_month() | fail
      y := parse_year() | fail

      hh := parse_2dig() | fail
      next_token() | fail
      =":" | return parse_error(": expected")
      mm := parse_2dig() | fail
      next_token() | fail
      if any(':') then {
         move(1)
         ss := parse_2dig() | fail
      } else
         ss := 0

      s := parse_zone() | fail
      zz := get_known_timezone(s) | Timezone(0, s)

      t.set_fields(y, m, d, hh, mm, ss, zz)

      return t
   end

   #
   # field           =    field-name ":" [field-body] CRLF
   # field-name      =  1*<any CHAR, excluding CTLs, SPACE, and ":">
   # field-body      =   *text [CRLF LWSP-char field-body]
   #
   method parse_field_impl(message)
      local f, v
      next_token() | fail
      f := tab(many(field_name_chars)) | return parse_error("Expect fieldname chars")
      next_token() | fail
      =":" | return parse_error(": expected")
      next_token() | fail
      v := tab(0)
      message.add_header(f, v)
      return
   end

   #
   # encoding := "Content-Transfer-Encoding" ":" mechanism
   #
   # mechanism :=     "7bit"  ;  case-insensitive
   #               / "quoted-printable"
   #               / "base64"
   #               / "8bit"
   #               / "binary"
   #               / x-token
   #
   method parse_content_transfer_encoding_impl()
      local s

      if s := tab(matchcl("7bit"|"quoted-printable"|"base64"|"8bit"|"binary")) then {
         return s
      }

      if match("X-"|"x-") then
         return parse_extension_token()

      return parse_error("Unknown encoding")
   end

   #
   # type :=          "application"     / "audio"
   #               / "image"           / "message"
   #               / "multipart"  / "text"
   #               / "video"           / extension-token
   #               ; All values case-insensitive
   #
   method parse_type()
      local s

      if s := tab(matchcl("application"|"audio"|"image"|"message"|"multipart"|"text"|"video")) then {
         return s
      }

      if match("X-"|"x-") then
         return parse_extension_token()

      return parse_error("Unknown type")
   end

   #
   # disposition-type := "inline"
   #                       / "attachment"
   #                       / extension-token
   #                       ; values are not case-sensitive
   method parse_disposition_type()
      local s

      if s := tab(matchcl("inline"|"attachment")) then {
         return s
      }

      if match("X-"|"x-") then
         return parse_extension_token()

      return parse_error("Unknown Content-disposition type")
   end

   #
   # extension-token :=  x-token / iana-token
   #
   #     iana-token := <a publicly-defined extension token,
   #               registered with IANA, as specified in
   #               appendix E>
   #
   #     x-token := <The two characters "X-" or "x-" followed, with
   #                 no intervening white space, by any token>
   #
   method parse_extension_token()
      s := =("X-"|"x-") | return parse_error("X- or x- expected")
      return s || parse_token_1521()
   end

   #
   # token  :=  1*<any (ASCII) CHAR except SPACE, CTLs,
   # or tspecials>
   #
   # tspecials :=  "(" / ")" / "<" / ">" / "@"
   #             /  "," / ";" / ":" / "\" / <">
   #             /  "/" / "[" / "]" / "?" / "="
   #            ; Must be in quoted-string,
   #            ; to use within parameter values
   #
   method parse_token_1521()
      return tab(many(atom_char_1521))
   end

   #
   # content  :=   "Content-Type"  ":"  type  "/"  subtype  *(";"
   #  parameter)
   #            ; case-insensitive matching of type and subtype
   # parameter := attribute "=" value
   #
   #  attribute := token   ; case-insensitive
   #
   #  value := token / quoted-string
   #
   method parse_content_type_impl(ct)
      local ty, key, val

      next_token() | fail

      /ct := ContentType()

      ty := parse_type() | fail
      next_token() | fail
      ="/" | return parse_error("/ expected")
      next_token() | fail
      st := parse_token_1521() | fail

      ct.set_type(ty)
      ct.set_subtype(st)

      repeat {
         next_token() | fail
         =";" | break
         next_token() | fail
         key := parse_token_1521() | fail
         next_token() | fail
         ="=" | return parse_error("= expected")
         next_token() | fail
         if any('\"') then
            val :=  parse_quoted_string() | fail
         else
            val := parse_token_1521() | fail

         ct.set_parameter(key, val)
      }

      return ct
   end

   #
   # disposition := "Content-Disposition" ":"
   #                    disposition-type
   #                    *(";" disposition-parm)
   #
   #     disposition-type := "inline"
   #                       / "attachment"
   #                       / extension-token
   #                       ; values are not case-sensitive
   #
   #     disposition-parm := filename-parm / parameter
   #
   #     filename-parm := "filename" "=" value;
   #
   method parse_content_disposition_impl(cd)
      local ty, key, val

      next_token() | fail

      /cd := ContentDisposition()

      ty := parse_disposition_type() | fail

      cd.set_type(ty)

      repeat {
         next_token() | fail
         =";" | break
         next_token() | fail
         key := parse_token_1521() | fail
         next_token() | fail
         ="=" | return parse_error("= expected")
         next_token() | fail
         if any('\"') then
            val :=  parse_quoted_string() | fail
         else
            val := parse_token_1521() | fail

         cd.set_parameter(key, val)
      }

      return cd
   end

   method parse_mailbox(s, mb)
      return s ? parse_mailbox_impl(mb)
   end

   method parse_group(s, group)
      return s ? parse_group_impl(group)
   end

   method parse_address_list(s, can_be_empty)
      return s ? parse_address_list_impl(can_be_empty)
   end

   method parse_mailbox_list(s, can_be_empty)
      return s ? parse_mailbox_list_impl(can_be_empty)
   end

   method parse_date_time(s)
      return s ? parse_date_time_impl()
   end

   method parse_field(s, message)
      return s ? parse_field_impl(message)
   end

   method parse_content_type(s, ct)
      return s ? parse_content_type_impl(ct)
   end

   method parse_content_disposition(s, ct)
      return s ? parse_content_disposition_impl(ct)
   end

   method parse_content_transfer_encoding(s)
      return s ? parse_content_transfer_encoding_impl()
   end

   initially()
      initial {
         init_parser()
      }
end