403 lines
11 KiB
Go
403 lines
11 KiB
Go
package ytdl
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// Endpoints and layouts used when talking to YouTube.
const (
	// youtubeBaseURL is the watch page; the video id is passed in the "v"
	// query parameter.
	youtubeBaseURL = "https://www.youtube.com/watch"
	// youtubeEmbededBaseURL is the embed page prefix; the video id is
	// appended directly to it.
	youtubeEmbededBaseURL = "https://www.youtube.com/embed/"
	// youtubeVideoEURL is the prefix of the "eurl" value sent to the video
	// info endpoint; the video id is appended directly to it.
	youtubeVideoEURL = "https://youtube.googleapis.com/v/"
	// youtubeVideoInfoURL is the endpoint queried for video info when the
	// watch page carries no player configuration.
	youtubeVideoInfoURL = "https://www.youtube.com/get_video_info"
	// youtubeDateFormat is the time layout of the "datePublished" meta tag.
	youtubeDateFormat = "2006-01-02"
)
|
|
|
|
// VideoInfo contains the info a youtube video
|
|
type VideoInfo struct {
|
|
// The video ID
|
|
ID string `json:"id"`
|
|
// The video title
|
|
Title string `json:"title"`
|
|
// The video description
|
|
Description string `json:"description"`
|
|
// The date the video was published
|
|
DatePublished time.Time `json:"datePublished"`
|
|
// Formats the video is available in
|
|
Formats FormatList `json:"formats"`
|
|
// List of keywords associated with the video
|
|
Keywords []string `json:"keywords"`
|
|
// Author of the video
|
|
Author string `json:"author"`
|
|
// Duration of the video
|
|
Duration time.Duration
|
|
|
|
htmlPlayerFile string
|
|
}
|
|
|
|
// GetVideoInfo fetches info from a url string, url object, or a url string
|
|
func GetVideoInfo(value interface{}) (*VideoInfo, error) {
|
|
switch t := value.(type) {
|
|
case *url.URL:
|
|
return GetVideoInfoFromURL(t)
|
|
case string:
|
|
u, err := url.ParseRequestURI(t)
|
|
if err != nil {
|
|
return GetVideoInfoFromID(t)
|
|
}
|
|
if u.Host == "youtu.be" {
|
|
return GetVideoInfoFromShortURL(u)
|
|
}
|
|
return GetVideoInfoFromURL(u)
|
|
default:
|
|
return nil, fmt.Errorf("Identifier type must be a string, *url.URL, or []byte")
|
|
}
|
|
}
|
|
|
|
// GetVideoInfoFromURL fetches video info from a youtube url
|
|
func GetVideoInfoFromURL(u *url.URL) (*VideoInfo, error) {
|
|
videoID := u.Query().Get("v")
|
|
if len(videoID) == 0 {
|
|
return nil, fmt.Errorf("Invalid youtube url, no video id")
|
|
}
|
|
return GetVideoInfoFromID(videoID)
|
|
}
|
|
|
|
// GetVideoInfoFromShortURL fetches video info from a short youtube url
|
|
func GetVideoInfoFromShortURL(u *url.URL) (*VideoInfo, error) {
|
|
if len(u.Path) >= 1 {
|
|
if path := u.Path[1:]; path != "" {
|
|
return GetVideoInfoFromID(path)
|
|
}
|
|
}
|
|
return nil, errors.New("Could not parse short URL")
|
|
}
|
|
|
|
// GetVideoInfoFromID fetches video info from a youtube video id
|
|
func GetVideoInfoFromID(id string) (*VideoInfo, error) {
|
|
u, _ := url.ParseRequestURI(youtubeBaseURL)
|
|
values := u.Query()
|
|
values.Set("v", id)
|
|
u.RawQuery = values.Encode()
|
|
|
|
resp, err := http.Get(u.String())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("Invalid status code: %d", resp.StatusCode)
|
|
}
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return getVideoInfoFromHTML(id, body)
|
|
}
|
|
|
|
// GetDownloadURL gets the download url for a format
|
|
func (info *VideoInfo) GetDownloadURL(format Format) (*url.URL, error) {
|
|
return getDownloadURL(format, info.htmlPlayerFile)
|
|
}
|
|
|
|
// GetThumbnailURL returns a url for the thumbnail image
|
|
// with the given quality
|
|
func (info *VideoInfo) GetThumbnailURL(quality ThumbnailQuality) *url.URL {
|
|
u, _ := url.Parse(fmt.Sprintf("http://img.youtube.com/vi/%s/%s.jpg",
|
|
info.ID, quality))
|
|
return u
|
|
}
|
|
|
|
// Download is a convenience method to download a format to an io.Writer
|
|
func (info *VideoInfo) Download(format Format, dest io.Writer) error {
|
|
u, err := info.GetDownloadURL(format)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
resp, err := http.Get(u.String())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode < 200 || resp.StatusCode > 299 {
|
|
return fmt.Errorf("Invalid status code: %d", resp.StatusCode)
|
|
}
|
|
_, err = io.Copy(dest, resp.Body)
|
|
return err
|
|
}
|
|
|
|
func getVideoInfoFromHTML(id string, html []byte) (*VideoInfo, error) {
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
info := &VideoInfo{}
|
|
|
|
// extract description and title
|
|
info.Description = strings.TrimSpace(doc.Find("#eow-description").Text())
|
|
info.Title = strings.TrimSpace(doc.Find("#eow-title").Text())
|
|
info.ID = id
|
|
dateStr, ok := doc.Find("meta[itemprop=\"datePublished\"]").Attr("content")
|
|
if !ok {
|
|
log.Debug("Unable to extract date published")
|
|
} else {
|
|
date, err := time.Parse(youtubeDateFormat, dateStr)
|
|
if err == nil {
|
|
info.DatePublished = date
|
|
} else {
|
|
log.Debug("Unable to parse date published", err.Error())
|
|
}
|
|
}
|
|
|
|
// match json in javascript
|
|
re := regexp.MustCompile("ytplayer.config = (.*?);ytplayer.load")
|
|
matches := re.FindSubmatch(html)
|
|
var jsonConfig map[string]interface{}
|
|
if len(matches) > 1 {
|
|
err = json.Unmarshal(matches[1], &jsonConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
} else {
|
|
log.Debug("Unable to extract json from default url, trying embedded url")
|
|
var resp *http.Response
|
|
resp, err = http.Get(youtubeEmbededBaseURL + id)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("Embeded url request returned status code %d ", resp.StatusCode)
|
|
}
|
|
html, err = ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// re = regexp.MustCompile("\"sts\"\\s*:\\s*(\\d+)")
|
|
re = regexp.MustCompile("yt.setConfig\\('PLAYER_CONFIG', (.*?)\\);</script>")
|
|
|
|
matches := re.FindSubmatch(html)
|
|
if len(matches) < 2 {
|
|
return nil, fmt.Errorf("Error extracting sts from embedded url response")
|
|
}
|
|
dec := json.NewDecoder(bytes.NewBuffer(matches[1]))
|
|
err = dec.Decode(&jsonConfig)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Unable to extract json from embedded url: %s", err.Error())
|
|
}
|
|
query := url.Values{
|
|
"sts": []string{strconv.Itoa(int(jsonConfig["sts"].(float64)))},
|
|
"video_id": []string{id},
|
|
"eurl": []string{youtubeVideoEURL + id},
|
|
}
|
|
|
|
resp, err = http.Get(youtubeVideoInfoURL + "?" + query.Encode())
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Error fetching video info: %s", err.Error())
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("Video info response invalid status code")
|
|
}
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Unable to read video info response body: %s", err.Error())
|
|
}
|
|
query, err = url.ParseQuery(string(body))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Unable to parse video info data: %s", err.Error())
|
|
}
|
|
args := make(map[string]interface{})
|
|
for k, v := range query {
|
|
if len(v) > 0 {
|
|
args[k] = v[0]
|
|
}
|
|
}
|
|
jsonConfig["args"] = args
|
|
}
|
|
|
|
inf := jsonConfig["args"].(map[string]interface{})
|
|
if status, ok := inf["status"].(string); ok && status == "fail" {
|
|
return nil, fmt.Errorf("Error %d:%s", inf["errorcode"], inf["reason"])
|
|
}
|
|
if a, ok := inf["author"].(string); ok {
|
|
info.Author = a
|
|
} else {
|
|
log.Debug("Unable to extract author")
|
|
}
|
|
|
|
if length, ok := inf["length_seconds"].(string); ok {
|
|
if duration, err := strconv.ParseInt(length, 10, 64); err == nil {
|
|
info.Duration = time.Second * time.Duration(duration)
|
|
} else {
|
|
log.Debug("Unable to parse duration string: ", length)
|
|
}
|
|
} else {
|
|
log.Debug("Unable to extract duration")
|
|
}
|
|
|
|
// For the future maybe
|
|
parseKey := func(key string) []string {
|
|
val, ok := inf[key].(string)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
vals := []string{}
|
|
split := strings.Split(val, ",")
|
|
for _, v := range split {
|
|
if v != "" {
|
|
vals = append(vals, v)
|
|
}
|
|
}
|
|
return vals
|
|
}
|
|
info.Keywords = parseKey("keywords")
|
|
info.htmlPlayerFile = jsonConfig["assets"].(map[string]interface{})["js"].(string)
|
|
|
|
/*
|
|
fmtList := parseKey("fmt_list")
|
|
fexp := parseKey("fexp")
|
|
watermark := parseKey("watermark")
|
|
|
|
if len(fmtList) != 0 {
|
|
vals := []string{}
|
|
for _, v := range fmtList {
|
|
vals = append(vals, strings.Split(v, "/")...)
|
|
} else {
|
|
info["fmt_list"] = []string{}
|
|
}
|
|
|
|
videoVerticals := []string{}
|
|
if videoVertsStr, ok := inf["video_verticals"].(string); ok {
|
|
videoVertsStr = string([]byte(videoVertsStr)[1 : len(videoVertsStr)-2])
|
|
videoVertsSplit := strings.Split(videoVertsStr, ", ")
|
|
for _, v := range videoVertsSplit {
|
|
if v != "" {
|
|
videoVerticals = append(videoVerticals, v)
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
var formatStrings []string
|
|
if fmtStreamMap, ok := inf["url_encoded_fmt_stream_map"].(string); ok {
|
|
formatStrings = append(formatStrings, strings.Split(fmtStreamMap, ",")...)
|
|
}
|
|
|
|
if adaptiveFormats, ok := inf["adaptive_fmts"].(string); ok {
|
|
formatStrings = append(formatStrings, strings.Split(adaptiveFormats, ",")...)
|
|
}
|
|
var formats FormatList
|
|
for _, v := range formatStrings {
|
|
query, err := url.ParseQuery(v)
|
|
if err == nil {
|
|
itag, _ := strconv.Atoi(query.Get("itag"))
|
|
if format, ok := newFormat(itag); ok {
|
|
if strings.HasPrefix(query.Get("conn"), "rtmp") {
|
|
format.meta["rtmp"] = true
|
|
}
|
|
for k, v := range query {
|
|
if len(v) == 1 {
|
|
format.meta[k] = v[0]
|
|
} else {
|
|
format.meta[k] = v
|
|
}
|
|
}
|
|
formats = append(formats, format)
|
|
} else {
|
|
log.Debug("No metadata found for itag: ", itag, ", skipping...")
|
|
}
|
|
} else {
|
|
log.Debug("Unable to format string", err.Error())
|
|
}
|
|
}
|
|
|
|
if dashManifestURL, ok := inf["dashmpd"].(string); ok {
|
|
tokens, err := getSigTokens(info.htmlPlayerFile)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Unable to extract signature tokens: %s", err.Error())
|
|
}
|
|
regex := regexp.MustCompile("\\/s\\/([a-fA-F0-9\\.]+)")
|
|
regexSub := regexp.MustCompile("([a-fA-F0-9\\.]+)")
|
|
dashManifestURL = regex.ReplaceAllStringFunc(dashManifestURL, func(str string) string {
|
|
return "/signature/" + decipherTokens(tokens, regexSub.FindString(str))
|
|
})
|
|
dashFormats, err := getDashManifest(dashManifestURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Unable to extract dash manifest: %s", err.Error())
|
|
}
|
|
|
|
for _, dashFormat := range dashFormats {
|
|
added := false
|
|
for j, format := range formats {
|
|
if dashFormat.Itag == format.Itag {
|
|
formats[j] = dashFormat
|
|
added = true
|
|
break
|
|
}
|
|
}
|
|
if !added {
|
|
formats = append(formats, dashFormat)
|
|
}
|
|
}
|
|
}
|
|
info.Formats = formats
|
|
return info, nil
|
|
}
|
|
|
|
// representation mirrors the parts of a DASH manifest <Representation>
// element that are decoded by getDashManifest.
type representation struct {
	// Itag is read from the element's "id" attribute.
	Itag int `xml:"id,attr"`
	// Height is read from the element's "height" attribute; it stays 0 when
	// the attribute is absent.
	Height int `xml:"height,attr"`
	// URL is the text of the nested <BaseURL> element.
	URL string `xml:"BaseURL"`
}
|
|
|
|
func getDashManifest(urlString string) (formats []Format, err error) {
|
|
|
|
resp, err := http.Get(urlString)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("Invalid status code %d", resp.StatusCode)
|
|
}
|
|
dec := xml.NewDecoder(resp.Body)
|
|
var token xml.Token
|
|
for ; err == nil; token, err = dec.Token() {
|
|
if el, ok := token.(xml.StartElement); ok && el.Name.Local == "Representation" {
|
|
var rep representation
|
|
err = dec.DecodeElement(&rep, &el)
|
|
if err != nil {
|
|
break
|
|
}
|
|
if format, ok := newFormat(rep.Itag); ok {
|
|
format.meta["url"] = rep.URL
|
|
if rep.Height != 0 {
|
|
format.Resolution = strconv.Itoa(rep.Height) + "p"
|
|
} else {
|
|
format.Resolution = ""
|
|
}
|
|
formats = append(formats, format)
|
|
} else {
|
|
log.Debug("No metadata found for itag: ", rep.Itag, ", skipping...")
|
|
}
|
|
}
|
|
}
|
|
if err != io.EOF {
|
|
return nil, err
|
|
}
|
|
return formats, nil
|
|
}
|