package ytdl

import (
	"bytes"
	"encoding/json"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	log "github.com/sirupsen/logrus"
)

const (
	youtubeBaseURL        = "https://www.youtube.com/watch"
	youtubeEmbededBaseURL = "https://www.youtube.com/embed/"
	youtubeVideoEURL      = "https://youtube.googleapis.com/v/"
	youtubeVideoInfoURL   = "https://www.youtube.com/get_video_info"
	youtubeDateFormat     = "2006-01-02"
)

// Regular expressions used while scraping. They are compiled once at package
// initialisation instead of on every call.
var (
	// playerConfigRegexp captures the JSON assigned to ytplayer.config on the
	// watch page.
	playerConfigRegexp = regexp.MustCompile("ytplayer.config = (.*?);ytplayer.load")
	// embeddedPlayerConfigRegexp captures the PLAYER_CONFIG JSON on the embed page.
	embeddedPlayerConfigRegexp = regexp.MustCompile("yt.setConfig\\('PLAYER_CONFIG', (.*?)\\);")
	// dashSignaturePathRegexp matches the "/s/<signature>" segment of a dash
	// manifest URL.
	dashSignaturePathRegexp = regexp.MustCompile("\\/s\\/([a-fA-F0-9\\.]+)")
	// dashSignatureValueRegexp extracts the signature value from such a segment.
	dashSignatureValueRegexp = regexp.MustCompile("([a-fA-F0-9\\.]+)")
)

// VideoInfo contains the info a youtube video
type VideoInfo struct {
	// The video ID
	ID string `json:"id"`
	// The video title
	Title string `json:"title"`
	// The video description
	Description string `json:"description"`
	// The date the video was published
	DatePublished time.Time `json:"datePublished"`
	// Formats the video is available in
	Formats FormatList `json:"formats"`
	// List of keywords associated with the video
	Keywords []string `json:"keywords"`
	// Author of the video
	Author string `json:"author"`
	// Duration of the video
	Duration time.Duration

	// htmlPlayerFile is the "js" asset taken from the player config; it is
	// passed to getDownloadURL and getSigTokens.
	htmlPlayerFile string
}

// GetVideoInfo fetches info from a video id string, a url string, or a
// *url.URL.
//
// A string that does not parse as a request URI is treated as a video id.
// URLs whose host is "youtu.be" are handled as short URLs. Any other value
// type yields an error.
func GetVideoInfo(value interface{}) (*VideoInfo, error) {
	switch t := value.(type) {
	case *url.URL:
		// Route short links the same way the string form does.
		if t != nil && t.Host == "youtu.be" {
			return GetVideoInfoFromShortURL(t)
		}
		return GetVideoInfoFromURL(t)
	case string:
		u, err := url.ParseRequestURI(t)
		if err != nil {
			return GetVideoInfoFromID(t)
		}
		if u.Host == "youtu.be" {
			return GetVideoInfoFromShortURL(u)
		}
		return GetVideoInfoFromURL(u)
	default:
		return nil, fmt.Errorf("Identifier type must be a string or *url.URL")
	}
}

// GetVideoInfoFromURL fetches video info from a youtube url.
// The video id is read from the "v" query parameter; a nil url or a missing
// id results in an error.
func GetVideoInfoFromURL(u *url.URL) (*VideoInfo, error) {
	if u == nil {
		return nil, fmt.Errorf("Invalid youtube url, no video id")
	}
	videoID := u.Query().Get("v")
	if len(videoID) == 0 {
		return nil, fmt.Errorf("Invalid youtube url, no video id")
	}
	return GetVideoInfoFromID(videoID)
}

// GetVideoInfoFromShortURL fetches video info from a short youtube url.
// The video id is the url path without its leading slash; a nil url or an
// empty path results in an error.
func GetVideoInfoFromShortURL(u *url.URL) (*VideoInfo, error) {
	if u != nil {
		if id := strings.TrimPrefix(u.Path, "/"); id != "" {
			return GetVideoInfoFromID(id)
		}
	}
	return nil, errors.New("Could not parse short URL")
}

// GetVideoInfoFromID fetches video info from a youtube video id.
// It downloads the watch page for the id and parses it; any response status
// other than 200 is reported as an error.
func GetVideoInfoFromID(id string) (*VideoInfo, error) {
	query := url.Values{}
	query.Set("v", id)

	resp, err := http.Get(youtubeBaseURL + "?" + query.Encode())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("Invalid status code: %d", resp.StatusCode)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return getVideoInfoFromHTML(id, body)
}

// GetDownloadURL gets the download url for a format
func (info *VideoInfo) GetDownloadURL(format Format) (*url.URL, error) {
	return getDownloadURL(format, info.htmlPlayerFile)
}

// GetThumbnailURL returns a url for the thumbnail image
// with the given quality
func (info *VideoInfo) GetThumbnailURL(quality ThumbnailQuality) *url.URL {
	// The parse error is ignored here, so the result may be nil if the
	// formatted string is not a valid URL.
	u, _ := url.Parse(fmt.Sprintf("http://img.youtube.com/vi/%s/%s.jpg", info.ID, quality))
	return u
}

// Download is a convenience method to download a format to an io.Writer.
// It resolves the download url for the format, requests it, and copies the
// response body to dest. A non-2xx response status is reported as an error.
func (info *VideoInfo) Download(format Format, dest io.Writer) error {
	u, err := info.GetDownloadURL(format)
	if err != nil {
		return err
	}
	resp, err := http.Get(u.String())
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("Invalid status code: %d", resp.StatusCode)
	}
	_, err = io.Copy(dest, resp.Body)
	return err
}

// getVideoInfoFromHTML builds a VideoInfo for id from the watch page html.
//
// Title, description and publish date are scraped from the document. The
// remaining fields come from the player config, which is read from the page
// or, when the page does not contain it, fetched through the embed page.
// Malformed or incomplete player data is reported as an error.
func getVideoInfoFromHTML(id string, html []byte) (*VideoInfo, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
	if err != nil {
		return nil, err
	}

	info := &VideoInfo{
		ID:          id,
		Description: strings.TrimSpace(doc.Find("#eow-description").Text()),
		Title:       strings.TrimSpace(doc.Find("#eow-title").Text()),
	}

	// The publish date is optional: failures are logged, not returned.
	if dateStr, ok := doc.Find("meta[itemprop=\"datePublished\"]").Attr("content"); !ok {
		log.Debug("Unable to extract date published")
	} else if date, err := time.Parse(youtubeDateFormat, dateStr); err != nil {
		log.Debug("Unable to parse date published: ", err.Error())
	} else {
		info.DatePublished = date
	}

	jsonConfig, err := loadPlayerConfig(id, html)
	if err != nil {
		return nil, err
	}

	inf, ok := jsonConfig["args"].(map[string]interface{})
	if !ok {
		return nil, errors.New("Unable to extract args from player config")
	}
	if status, ok := inf["status"].(string); ok && status == "fail" {
		// errorcode is a string when it comes from the video info query
		// string, so it is formatted with %v rather than %d.
		return nil, fmt.Errorf("Error %v:%v", inf["errorcode"], inf["reason"])
	}

	if a, ok := inf["author"].(string); ok {
		info.Author = a
	} else {
		log.Debug("Unable to extract author")
	}

	if length, ok := inf["length_seconds"].(string); ok {
		if seconds, err := strconv.ParseInt(length, 10, 64); err == nil {
			info.Duration = time.Second * time.Duration(seconds)
		} else {
			log.Debug("Unable to parse duration string: ", length)
		}
	} else {
		log.Debug("Unable to extract duration")
	}

	info.Keywords = splitCommaList(inf, "keywords")

	assets, ok := jsonConfig["assets"].(map[string]interface{})
	if !ok {
		return nil, errors.New("Unable to extract assets from player config")
	}
	playerFile, ok := assets["js"].(string)
	if !ok {
		return nil, errors.New("Unable to extract html player file from player config")
	}
	info.htmlPlayerFile = playerFile

	/*
		Disabled code kept from the original implementation (for the future maybe):

		fmtList := parseKey("fmt_list")
		fexp := parseKey("fexp")
		watermark := parseKey("watermark")

		if len(fmtList) != 0 {
			vals := []string{}
			for _, v := range fmtList {
				vals = append(vals, strings.Split(v, "/")...)
			}
		} else {
			info["fmt_list"] = []string{}
		}

		videoVerticals := []string{}
		if videoVertsStr, ok := inf["video_verticals"].(string); ok {
			videoVertsStr = string([]byte(videoVertsStr)[1 : len(videoVertsStr)-2])
			videoVertsSplit := strings.Split(videoVertsStr, ", ")
			for _, v := range videoVertsSplit {
				if v != "" {
					videoVerticals = append(videoVerticals, v)
				}
			}
		}
	*/

	formats := parseStreamFormats(inf)

	if dashManifestURL, ok := inf["dashmpd"].(string); ok {
		tokens, err := getSigTokens(info.htmlPlayerFile)
		if err != nil {
			return nil, fmt.Errorf("Unable to extract signature tokens: %s", err.Error())
		}
		// Replace every "/s/<sig>" path segment with "/signature/<deciphered sig>".
		dashManifestURL = dashSignaturePathRegexp.ReplaceAllStringFunc(dashManifestURL, func(str string) string {
			return "/signature/" + decipherTokens(tokens, dashSignatureValueRegexp.FindString(str))
		})
		dashFormats, err := getDashManifest(dashManifestURL)
		if err != nil {
			return nil, fmt.Errorf("Unable to extract dash manifest: %s", err.Error())
		}

		// A dash format replaces an already known format with the same itag,
		// otherwise it is appended.
		for _, dashFormat := range dashFormats {
			added := false
			for j, format := range formats {
				if dashFormat.Itag == format.Itag {
					formats[j] = dashFormat
					added = true
					break
				}
			}
			if !added {
				formats = append(formats, dashFormat)
			}
		}
	}

	info.Formats = formats
	return info, nil
}

// loadPlayerConfig returns the decoded player config for the video. It is
// taken from the watch page html when present there; otherwise it is fetched
// through the embed page.
func loadPlayerConfig(id string, html []byte) (map[string]interface{}, error) {
	if matches := playerConfigRegexp.FindSubmatch(html); len(matches) > 1 {
		var jsonConfig map[string]interface{}
		if err := json.Unmarshal(matches[1], &jsonConfig); err != nil {
			return nil, err
		}
		return jsonConfig, nil
	}
	log.Debug("Unable to extract json from default url, trying embedded url")
	return fetchEmbeddedPlayerConfig(id)
}

// fetchEmbeddedPlayerConfig reads PLAYER_CONFIG from the embed page of the
// video, then requests the video info endpoint with the config's "sts" value
// and stores the first value of each returned query key under "args".
func fetchEmbeddedPlayerConfig(id string) (map[string]interface{}, error) {
	resp, err := http.Get(youtubeEmbededBaseURL + id)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("Embeded url request returned status code %d ", resp.StatusCode)
	}
	embedHTML, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	matches := embeddedPlayerConfigRegexp.FindSubmatch(embedHTML)
	if len(matches) < 2 {
		return nil, fmt.Errorf("Error extracting sts from embedded url response")
	}
	var jsonConfig map[string]interface{}
	if err := json.NewDecoder(bytes.NewBuffer(matches[1])).Decode(&jsonConfig); err != nil {
		return nil, fmt.Errorf("Unable to extract json from embedded url: %s", err.Error())
	}

	// JSON numbers decode to float64; anything else means "sts" is missing
	// or malformed.
	sts, ok := jsonConfig["sts"].(float64)
	if !ok {
		return nil, fmt.Errorf("Error extracting sts from embedded url response")
	}

	query := url.Values{
		"sts":      []string{strconv.Itoa(int(sts))},
		"video_id": []string{id},
		"eurl":     []string{youtubeVideoEURL + id},
	}
	infoResp, err := http.Get(youtubeVideoInfoURL + "?" + query.Encode())
	if err != nil {
		return nil, fmt.Errorf("Error fetching video info: %s", err.Error())
	}
	defer infoResp.Body.Close()

	if infoResp.StatusCode != 200 {
		return nil, fmt.Errorf("Video info response invalid status code")
	}
	body, err := ioutil.ReadAll(infoResp.Body)
	if err != nil {
		return nil, fmt.Errorf("Unable to read video info response body: %s", err.Error())
	}
	values, err := url.ParseQuery(string(body))
	if err != nil {
		return nil, fmt.Errorf("Unable to parse video info data: %s", err.Error())
	}

	args := make(map[string]interface{}, len(values))
	for k, v := range values {
		if len(v) > 0 {
			args[k] = v[0]
		}
	}
	jsonConfig["args"] = args
	return jsonConfig, nil
}

// splitCommaList returns the non-empty comma separated items stored as a
// string under key in inf. It returns nil when the key is absent or its
// value is not a string, and an empty non-nil slice when the string holds
// no items.
func splitCommaList(inf map[string]interface{}, key string) []string {
	val, ok := inf[key].(string)
	if !ok {
		return nil
	}
	vals := []string{}
	for _, v := range strings.Split(val, ",") {
		if v != "" {
			vals = append(vals, v)
		}
	}
	return vals
}

// parseStreamFormats builds the format list from the
// "url_encoded_fmt_stream_map" and "adaptive_fmts" entries of inf. Entries
// that cannot be parsed, or whose itag is not accepted by newFormat, are
// logged and skipped.
func parseStreamFormats(inf map[string]interface{}) FormatList {
	var formatStrings []string
	for _, key := range []string{"url_encoded_fmt_stream_map", "adaptive_fmts"} {
		if streamMap, ok := inf[key].(string); ok {
			formatStrings = append(formatStrings, strings.Split(streamMap, ",")...)
		}
	}

	var formats FormatList
	for _, formatString := range formatStrings {
		query, err := url.ParseQuery(formatString)
		if err != nil {
			log.Debug("Unable to format string: ", err.Error())
			continue
		}
		// A missing or non-numeric itag yields 0, which is then passed to
		// newFormat like any other value.
		itag, _ := strconv.Atoi(query.Get("itag"))
		format, ok := newFormat(itag)
		if !ok {
			log.Debug("No metadata found for itag: ", itag, ", skipping...")
			continue
		}
		if strings.HasPrefix(query.Get("conn"), "rtmp") {
			format.meta["rtmp"] = true
		}
		// Single valued keys are stored as a string, others as the full slice.
		for k, v := range query {
			if len(v) == 1 {
				format.meta[k] = v[0]
			} else {
				format.meta[k] = v
			}
		}
		formats = append(formats, format)
	}
	return formats
}

// representation mirrors the attributes read from a <Representation> element
// of a dash manifest.
type representation struct {
	Itag   int    `xml:"id,attr"`
	Height int    `xml:"height,attr"`
	URL    string `xml:"BaseURL"`
}

// getDashManifest downloads the dash manifest at urlString and returns one
// format for every <Representation> element whose id is accepted by
// newFormat. A non-200 response status or an XML error is returned as an
// error.
func getDashManifest(urlString string) (formats []Format, err error) {
	resp, err := http.Get(urlString)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("Invalid status code %d", resp.StatusCode)
	}

	dec := xml.NewDecoder(resp.Body)
	for {
		token, tokenErr := dec.Token()
		if tokenErr == io.EOF {
			// End of document: everything collected so far is the result.
			return formats, nil
		}
		if tokenErr != nil {
			return nil, tokenErr
		}

		el, ok := token.(xml.StartElement)
		if !ok || el.Name.Local != "Representation" {
			continue
		}

		var rep representation
		if decodeErr := dec.DecodeElement(&rep, &el); decodeErr != nil {
			return nil, decodeErr
		}

		format, ok := newFormat(rep.Itag)
		if !ok {
			log.Debug("No metadata found for itag: ", rep.Itag, ", skipping...")
			continue
		}
		format.meta["url"] = rep.URL
		if rep.Height != 0 {
			format.Resolution = strconv.Itoa(rep.Height) + "p"
		} else {
			format.Resolution = ""
		}
		formats = append(formats, format)
	}
}