From d77bd3189f9afa64951a4ee009e6684edc1f532e Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Tue, 13 May 2025 18:57:17 +0900 Subject: [PATCH 1/3] =?UTF-8?q?fix=20:=20page.goto=EC=97=90=20waitUntil=20?= =?UTF-8?q?=EC=98=B5=EC=85=98=EC=9C=BC=EB=A1=9C=20=EC=9D=B4=EC=8A=88=20?= =?UTF-8?q?=ED=95=B4=EA=B2=B0.=20=EB=A1=9C=EC=A7=81=20=EC=88=98=EC=A0=95?= =?UTF-8?q?=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- packages/crawling/package.json | 6 ++++ packages/crawling/src/crawlYoutube.ts | 47 +++++++++++++++++++++---- packages/crawling/src/findKYByOpen.ts | 2 +- packages/crawling/src/logData.ts | 6 ++++ packages/crawling/src/supabase/getDB.ts | 4 +-- 6 files changed, 57 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 829ab55..09e4652 100644 --- a/.gitignore +++ b/.gitignore @@ -39,5 +39,5 @@ yarn-error.log* *.pem # Crawling -**/logs/*.txt +**/log/*.txt diff --git a/packages/crawling/package.json b/packages/crawling/package.json index 8f69a6a..f068adb 100644 --- a/packages/crawling/package.json +++ b/packages/crawling/package.json @@ -6,6 +6,12 @@ "exports": { ".": "./src/index.js" }, + "scripts": { + "dev-ky-open": "tsx src/findKYByOpen.ts", + "dev-ky-youtube": "tsx src/crawlYoutube.ts", + "lint": "eslint . --ext .ts,.js", + "test": "vitest run" + }, "dependencies": { "@repo/open-api": "workspace:*", "@supabase/supabase-js": "^2.49.1", diff --git a/packages/crawling/src/crawlYoutube.ts b/packages/crawling/src/crawlYoutube.ts index 490caf2..a055bec 100644 --- a/packages/crawling/src/crawlYoutube.ts +++ b/packages/crawling/src/crawlYoutube.ts @@ -5,6 +5,25 @@ import { Song } from "./types"; import { updateDataLog } from "./logData"; import { updateKYDB } from "./supabase/updateDB"; +const successCase: Song[] = []; +const failedCase: Song[] = []; + +// process.on("SIGINT", async () => { +// console.log("프로세스가 종료됩니다. 지금까지의 데이터를 업데이트 중..."); +// console.log("resultData : ", resultData.length); +// const result = await updateKYDB(resultData); + +// console.log(result); +// console.log("프로세스가 종료됩니다. 로그를 기록 중..."); + +// await Promise.all([ +// updateDataLog(successCase, "log/crawlYodutubeSuccess.txt"), +// updateDataLog(failedCase, "log/crawlYoutubeFailed.txt"), +// ]); + +// console.log("로그 기록 완료."); +// }); + const browser = await puppeteer.launch(); const page = await browser.newPage(); @@ -12,12 +31,23 @@ const baseUrl = "https://www.youtube.com/@KARAOKEKY/search"; async function scrapeSongNumber(query: string) { const searchUrl = `${baseUrl}?query=${encodeURIComponent(query)}`; - await page.goto(searchUrl); + + // page.goto의 waitUntil 문제였음! + await page.goto(searchUrl, { + waitUntil: "networkidle2", + }); const html = await page.content(); const $ = cheerio.load(html); + // id contents 의 첫번째 ytd-item-section-renderer 찾기 - const firstItem = $("#contents ytd-item-section-renderer").first(); + // const firstItem = $("#contents ytd-item-section-renderer").first(); + + // await 안해도 해결! + // await page.waitForSelector("ytd-video-renderer"); + + const firstItem = $("ytd-video-renderer").first(); + // yt-formatted-string 찾기 const title = firstItem.find("yt-formatted-string").first().text().trim(); @@ -30,23 +60,28 @@ async function scrapeSongNumber(query: string) { function extractKaraokeNumber(title: string) { // KY. 찾고 ) 가 올때까지 찾기 - const matchResult = title.match(/KY\.\s*(\d{3,5})\)/); + const matchResult = title.match(/KY\.\s*(\d{2,5})\)/); const karaokeNumber = matchResult ? matchResult[1] : null; return karaokeNumber; } // 사용 -const data = await getKYNULLDB(); -console.log("getKYNULLDB : ", data); +const data = await getKYNULLDB(5000); +console.log("getKYNULLDB : ", data.length); const resultData: Song[] = []; let index = 0; for (const song of data) { - const query = song.title + " - " + song.artist; + const query = song.title + "-" + song.artist; + console.log(song.title, " - ", song.artist); const result = await scrapeSongNumber(query); if (result) { + console.log("success : ", result); resultData.push({ ...song, num_ky: result }); + successCase.push(song); + } else { + failedCase.push(song); } index++; console.log("scrapeSongNumber : ", index); diff --git a/packages/crawling/src/findKYByOpen.ts b/packages/crawling/src/findKYByOpen.ts index 000c172..fcd5c36 100644 --- a/packages/crawling/src/findKYByOpen.ts +++ b/packages/crawling/src/findKYByOpen.ts @@ -69,7 +69,7 @@ for (const song of kyNullData) { // 6079개 업데이트 // 2차 시도 -// 15065개 업데이트, 제목 가수 이름 일치 이슈 +// 15065개 업데이트, 제목 가수 이름 불일치 이슈 console.log(` 총 ${kyNullData.length}곡 중: diff --git a/packages/crawling/src/logData.ts b/packages/crawling/src/logData.ts index 122220a..2041c8b 100644 --- a/packages/crawling/src/logData.ts +++ b/packages/crawling/src/logData.ts @@ -16,6 +16,12 @@ export function updateDataLog(unknownData: T[] | T, filename: string) { }); const logPath = path.join(filename); + const logDir = path.dirname(logPath); // 디렉터리 경로 추출 + + // 디렉터리가 없으면 생성 + if (!fs.existsSync(logDir)) { + fs.mkdirSync(logDir, { recursive: true }); + } if (unknownData instanceof Array) { // 로그 문자열 생성 diff --git a/packages/crawling/src/supabase/getDB.ts b/packages/crawling/src/supabase/getDB.ts index b95ca64..fe2bed4 100644 --- a/packages/crawling/src/supabase/getDB.ts +++ b/packages/crawling/src/supabase/getDB.ts @@ -32,7 +32,7 @@ export async function getJapaneseDB() { return hasJapaneseData; } -export async function getKYNULLDB() { +export async function getKYNULLDB(max: number = 50000) { const supabase = getClient(); // artist 정렬 @@ -53,5 +53,5 @@ export async function getKYNULLDB() { } }); - return isKYNULLData; + return isKYNULLData.slice(0, max); } From 3e4509fa02670618a43b6f6473d00281618d9f95 Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Tue, 13 May 2025 18:58:01 +0900 Subject: [PATCH 2/3] =?UTF-8?q?fix=20:=20=EB=B2=88=EB=93=A4=EB=9F=AC=20?= =?UTF-8?q?=EC=8B=9C=EA=B0=81=ED=99=94=20=EB=AA=A8=EB=93=88=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80.=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EC=A4=91=EA=B0=84?= =?UTF-8?q?=EB=A7=88=EB=8B=A4=20DB=EC=99=80=EC=9D=98=20=EB=8F=99=EA=B8=B0?= =?UTF-8?q?=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/web/next.config.ts | 9 ++- apps/web/package.json | 1 + packages/crawling/src/crawlYoutube.ts | 53 +++++++------ packages/crawling/src/supabase/getDB.ts | 25 ++++-- pnpm-lock.yaml | 100 ++++++++++++++++++++++++ 5 files changed, 156 insertions(+), 32 deletions(-) diff --git a/apps/web/next.config.ts b/apps/web/next.config.ts index 5e891cf..c3c7c07 100644 --- a/apps/web/next.config.ts +++ b/apps/web/next.config.ts @@ -1,7 +1,14 @@ +import withBundleAnalyzer from '@next/bundle-analyzer'; import type { NextConfig } from 'next'; const nextConfig: NextConfig = { /* config options here */ }; -export default nextConfig; +const withBundle = withBundleAnalyzer({ + // enabled: process.env.ANALYZE === 'true', + enabled: true, + openAnalyzer: true, +}); + +export default withBundle(nextConfig); diff --git a/apps/web/package.json b/apps/web/package.json index f5ad910..a5b2d78 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -15,6 +15,7 @@ "@dnd-kit/modifiers": "^9.0.0", "@dnd-kit/sortable": "^10.0.0", "@dnd-kit/utilities": "^3.2.2", + "@next/bundle-analyzer": "^15.3.2", "@radix-ui/react-checkbox": "^1.1.5", "@radix-ui/react-dialog": "^1.1.6", "@radix-ui/react-dropdown-menu": "^2.1.6", diff --git a/packages/crawling/src/crawlYoutube.ts b/packages/crawling/src/crawlYoutube.ts index a055bec..0216f65 100644 --- a/packages/crawling/src/crawlYoutube.ts +++ b/packages/crawling/src/crawlYoutube.ts @@ -5,19 +5,19 @@ import { Song } from "./types"; import { updateDataLog } from "./logData"; import { updateKYDB } from "./supabase/updateDB"; -const successCase: Song[] = []; -const failedCase: Song[] = []; +const stackData: Song[] = []; +const totalData: Song[] = []; // process.on("SIGINT", async () => { // console.log("프로세스가 종료됩니다. 지금까지의 데이터를 업데이트 중..."); -// console.log("resultData : ", resultData.length); -// const result = await updateKYDB(resultData); +// console.log("stackData : ", stackData.length); +// const result = await updateKYDB(stackData); // console.log(result); // console.log("프로세스가 종료됩니다. 로그를 기록 중..."); // await Promise.all([ -// updateDataLog(successCase, "log/crawlYodutubeSuccess.txt"), +// updateDataLog(totalData, "log/crawlYodutubeSuccess.txt"), // updateDataLog(failedCase, "log/crawlYoutubeFailed.txt"), // ]); @@ -29,7 +29,7 @@ const page = await browser.newPage(); const baseUrl = "https://www.youtube.com/@KARAOKEKY/search"; -async function scrapeSongNumber(query: string) { +const scrapeSongNumber = async (query: string) => { const searchUrl = `${baseUrl}?query=${encodeURIComponent(query)}`; // page.goto의 waitUntil 문제였음! @@ -43,9 +43,6 @@ async function scrapeSongNumber(query: string) { // id contents 의 첫번째 ytd-item-section-renderer 찾기 // const firstItem = $("#contents ytd-item-section-renderer").first(); - // await 안해도 해결! - // await page.waitForSelector("ytd-video-renderer"); - const firstItem = $("ytd-video-renderer").first(); // yt-formatted-string 찾기 @@ -53,44 +50,54 @@ async function scrapeSongNumber(query: string) { const karaokeNumber = extractKaraokeNumber(title); - // await browser.close(); - return karaokeNumber; -} +}; -function extractKaraokeNumber(title: string) { +const extractKaraokeNumber = (title: string) => { // KY. 찾고 ) 가 올때까지 찾기 const matchResult = title.match(/KY\.\s*(\d{2,5})\)/); const karaokeNumber = matchResult ? matchResult[1] : null; return karaokeNumber; -} +}; + +const refreshData = async () => { + console.log("refreshData!!!!!!!"); + const result = await updateKYDB(stackData); + updateDataLog(result.success, "log/crawlYoutubeSuccess.txt"); + updateDataLog(result.failed, "log/crawlYoutubeFailed.txt"); + + stackData.length = 0; // stackData 초기화 + console.log("refreshData END!!!!!!!"); +}; // 사용 const data = await getKYNULLDB(5000); console.log("getKYNULLDB : ", data.length); -const resultData: Song[] = []; let index = 0; for (const song of data) { + if (stackData.length > 100) { + refreshData(); + } const query = song.title + "-" + song.artist; console.log(song.title, " - ", song.artist); const result = await scrapeSongNumber(query); if (result) { console.log("success : ", result); - resultData.push({ ...song, num_ky: result }); - successCase.push(song); - } else { - failedCase.push(song); + stackData.push({ ...song, num_ky: result }); + totalData.push({ ...song, num_ky: result }); } index++; console.log("scrapeSongNumber : ", index); } -console.log("resultData : ", resultData.length); -const result = await updateKYDB(resultData); - -console.log(result); +console.log("totalData : ", totalData.length); +// const result = await updateKYDB(totalData); +const result = await updateKYDB(stackData); updateDataLog(result.success, "log/crawlYoutubeSuccess.txt"); updateDataLog(result.failed, "log/crawlYoutubeFailed.txt"); + +// 5.13 1차 시도 +// 5000개 중 3507개 성공 diff --git a/packages/crawling/src/supabase/getDB.ts b/packages/crawling/src/supabase/getDB.ts index fe2bed4..c3ca9d3 100644 --- a/packages/crawling/src/supabase/getDB.ts +++ b/packages/crawling/src/supabase/getDB.ts @@ -36,22 +36,31 @@ export async function getKYNULLDB(max: number = 50000) { const supabase = getClient(); // artist 정렬 + // const { data, error } = await supabase + // .from("songs") + // .select("id, title, artist, num_tj, num_ky") + // .order("title", { ascending: true }); + const { data, error } = await supabase .from("songs") .select("id, title, artist, num_tj, num_ky") - .order("title", { ascending: true }); + .is("num_ky", null) // num_ky가 null인 데이터만 가져옴 + .order("title", { ascending: true }) + .limit(max); // Supabase 쿼리 안에서의 한계를 넘을 수는 없음 if (error) throw error; console.log("data", data.length); - const isKYNULLData: Song[] = []; + return data; - data.forEach((song) => { - if (song.num_ky === null) { - isKYNULLData.push(song); - } - }); + // const isKYNULLData: Song[] = []; + + // data.forEach((song) => { + // if (song.num_ky === null) { + // isKYNULLData.push(song); + // } + // }); - return isKYNULLData.slice(0, max); + // return isKYNULLData.slice(0, max); } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 128524e..442c8a4 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -142,6 +142,9 @@ importers: '@dnd-kit/utilities': specifier: ^3.2.2 version: 3.2.2(react@19.1.0) + '@next/bundle-analyzer': + specifier: ^15.3.2 + version: 15.3.2 '@radix-ui/react-checkbox': specifier: ^1.1.5 version: 1.2.3(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -1160,6 +1163,10 @@ packages: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} + '@discoveryjs/json-ext@0.5.7': + resolution: {integrity: sha512-dBVuXR082gk3jsFp7Rd/JI4kytwGHecnCoTtXFb7DB6CNHp4rg5k1bhg0nWdLGLnOV71lmDzGQaLMy8iPLY0pw==} + engines: {node: '>=10.0.0'} + '@dnd-kit/accessibility@3.1.1': resolution: {integrity: sha512-2P+YgaXF+gRsIihwwY1gCsQSYnu9Zyj2py8kY5fFvUM1qm2WA2u639R6YNVfU4GWr+ZM5mqEsfHZZLoRONbemw==} peerDependencies: @@ -1721,6 +1728,9 @@ packages: '@jridgewell/trace-mapping@0.3.9': resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} + '@next/bundle-analyzer@15.3.2': + resolution: {integrity: sha512-zY5O1PNKNxWEjaFX8gKzm77z2oL0cnj+m5aiqNBgay9LPLCDO13Cf+FJONeNq/nJjeXptwHFT9EMmTecF9U4Iw==} + '@next/env@15.2.2': resolution: {integrity: sha512-yWgopCfA9XDR8ZH3taB5nRKtKJ1Q5fYsTOuYkzIIoS8TJ0UAUKAGF73JnGszbjk2ufAQDj6mDdgsJAFx5CLtYQ==} @@ -1795,6 +1805,9 @@ packages: resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} + '@polka/url@1.0.0-next.29': + resolution: {integrity: sha512-wwQAWhWSuHaag8c4q/KN/vCoeOJYshAIvMQwD4GpSb3OiZklFfvAgmj0VCBBImRpuF/aFgIRzllXlVX93Jevww==} + '@puppeteer/browsers@2.10.2': resolution: {integrity: sha512-i4Ez+s9oRWQbNjtI/3+jxr7OH508mjAKvza0ekPJem0ZtmsYHP3B5dq62+IaBHKaGCOuqJxXzvFLUhJvQ6jtsQ==} engines: {node: '>=18'} @@ -3607,6 +3620,9 @@ packages: resolution: {integrity: sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==} engines: {node: '>= 0.4'} + debounce@1.2.1: + resolution: {integrity: sha512-XRRe6Glud4rd/ZGQfiV1ruXSfbvfJedlV9Y6zOlP+2K04vBYiJEte6stfFkCP03aMnY5tsipamumUjL14fofug==} + debug@2.6.9: resolution: {integrity: sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==} peerDependencies: @@ -3777,6 +3793,9 @@ packages: resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} engines: {node: '>= 0.4'} + duplexer@0.1.2: + resolution: {integrity: sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==} + eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} @@ -4454,6 +4473,10 @@ packages: graphemer@1.4.0: resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==} + gzip-size@6.0.0: + resolution: {integrity: sha512-ax7ZYomf6jqPTQ4+XCpUGyXKHk5WweS+e05MBO4/y3WJ5RkmPXNKvX+bx1behVILVwr6JSQvZAku021CHPXG3Q==} + engines: {node: '>=10'} + handlebars@4.7.8: resolution: {integrity: sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==} engines: {node: '>=0.4.7'} @@ -4745,6 +4768,10 @@ packages: resolution: {integrity: sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==} engines: {node: '>=0.10.0'} + is-plain-object@5.0.0: + resolution: {integrity: sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==} + engines: {node: '>=0.10.0'} + is-potential-custom-element-name@1.0.1: resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} @@ -5513,6 +5540,10 @@ packages: engines: {node: '>=10'} hasBin: true + mrmime@2.0.1: + resolution: {integrity: sha512-Y3wQdFg2Va6etvQ5I82yUhGdsKrcYox6p7FfL1LbK2J4V01F9TGlepTIhnK24t7koZibmg82KGglhA1XK5IsLQ==} + engines: {node: '>=10'} + ms@2.0.0: resolution: {integrity: sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==} @@ -5724,6 +5755,10 @@ packages: zod: optional: true + opener@1.5.2: + resolution: {integrity: sha512-ur5UIdyw5Y7yEj9wLzhqXiy6GZ3Mwx0yGI+5sMn2r0N0v3cKJvUmFH5yPP+WXh9e0xfyzyJX95D8l088DNFj7A==} + hasBin: true + optionator@0.9.4: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} @@ -6589,6 +6624,10 @@ packages: simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} + sirv@2.0.4: + resolution: {integrity: sha512-94Bdh3cC2PKrbgSOUqTiGPWVZeSiXfKOVZNJniWoqrWrRkB1CJzBU3NEbiTsPcYy1lDsANA/THzS+9WBiy5nfQ==} + engines: {node: '>= 10'} + sisteransi@1.0.5: resolution: {integrity: sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==} @@ -6943,6 +6982,10 @@ packages: resolution: {integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==} engines: {node: '>=0.6'} + totalist@3.0.1: + resolution: {integrity: sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ==} + engines: {node: '>=6'} + tough-cookie@4.1.4: resolution: {integrity: sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==} engines: {node: '>=6'} @@ -7307,6 +7350,11 @@ packages: resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==} engines: {node: '>=12'} + webpack-bundle-analyzer@4.10.1: + resolution: {integrity: sha512-s3P7pgexgT/HTUSYgxJyn28A+99mmLq4HsJepMPzu0R8ImJc52QNqaFYW1Z2z2uIb1/J3eYgaAWVpaC+v/1aAQ==} + engines: {node: '>= 10.13.0'} + hasBin: true + webpack-sources@3.2.3: resolution: {integrity: sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==} engines: {node: '>=10.13.0'} @@ -8439,6 +8487,8 @@ snapshots: dependencies: '@jridgewell/trace-mapping': 0.3.9 + '@discoveryjs/json-ext@0.5.7': {} + '@dnd-kit/accessibility@3.1.1(react@19.1.0)': dependencies: react: 19.1.0 @@ -9212,6 +9262,13 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.0 + '@next/bundle-analyzer@15.3.2': + dependencies: + webpack-bundle-analyzer: 4.10.1 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + '@next/env@15.2.2': {} '@next/eslint-plugin-next@15.3.1': @@ -9261,6 +9318,8 @@ snapshots: '@pkgjs/parseargs@0.11.0': optional: true + '@polka/url@1.0.0-next.29': {} + '@puppeteer/browsers@2.10.2': dependencies: debug: 4.4.0 @@ -11353,6 +11412,8 @@ snapshots: es-errors: 1.3.0 is-data-view: 1.0.2 + debounce@1.2.1: {} + debug@2.6.9: dependencies: ms: 2.0.0 @@ -11498,6 +11559,8 @@ snapshots: es-errors: 1.3.0 gopd: 1.2.0 + duplexer@0.1.2: {} + eastasianwidth@0.2.0: {} ee-first@1.1.1: {} @@ -12378,6 +12441,10 @@ snapshots: graphemer@1.4.0: {} + gzip-size@6.0.0: + dependencies: + duplexer: 0.1.2 + handlebars@4.7.8: dependencies: minimist: 1.2.8 @@ -12697,6 +12764,8 @@ snapshots: dependencies: isobject: 3.0.1 + is-plain-object@5.0.0: {} + is-potential-custom-element-name@1.0.1: {} is-regex@1.2.1: @@ -13767,6 +13836,8 @@ snapshots: mkdirp@1.0.4: {} + mrmime@2.0.1: {} + ms@2.0.0: {} ms@2.1.3: {} @@ -13976,6 +14047,8 @@ snapshots: transitivePeerDependencies: - encoding + opener@1.5.2: {} + optionator@0.9.4: dependencies: deep-is: 0.1.4 @@ -14981,6 +15054,12 @@ snapshots: dependencies: is-arrayish: 0.3.2 + sirv@2.0.4: + dependencies: + '@polka/url': 1.0.0-next.29 + mrmime: 2.0.1 + totalist: 3.0.1 + sisteransi@1.0.5: {} slash@3.0.0: {} @@ -15349,6 +15428,8 @@ snapshots: toidentifier@1.0.1: {} + totalist@3.0.1: {} + tough-cookie@4.1.4: dependencies: psl: 1.15.0 @@ -15700,6 +15781,25 @@ snapshots: webidl-conversions@7.0.0: {} + webpack-bundle-analyzer@4.10.1: + dependencies: + '@discoveryjs/json-ext': 0.5.7 + acorn: 8.14.1 + acorn-walk: 8.3.4 + commander: 7.2.0 + debounce: 1.2.1 + escape-string-regexp: 4.0.0 + gzip-size: 6.0.0 + html-escaper: 2.0.2 + is-plain-object: 5.0.0 + opener: 1.5.2 + picocolors: 1.1.1 + sirv: 2.0.4 + ws: 7.5.10 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + webpack-sources@3.2.3: {} webpack@5.99.7: From 99fd94d20813e325973544bfd28e60680185cadd Mon Sep 17 00:00:00 2001 From: GulSam00 Date: Wed, 14 May 2025 21:03:28 +0900 Subject: [PATCH 3/3] =?UTF-8?q?fix=20:=20puppeteer=20timeout=20=EB=AC=B4?= =?UTF-8?q?=ED=9A=A8=ED=99=94.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/crawling/src/crawlYoutube.ts | 11 +++++++---- packages/crawling/src/supabase/getDB.ts | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/crawling/src/crawlYoutube.ts b/packages/crawling/src/crawlYoutube.ts index 0216f65..0ed4f6c 100644 --- a/packages/crawling/src/crawlYoutube.ts +++ b/packages/crawling/src/crawlYoutube.ts @@ -35,6 +35,7 @@ const scrapeSongNumber = async (query: string) => { // page.goto의 waitUntil 문제였음! await page.goto(searchUrl, { waitUntil: "networkidle2", + timeout: 0, }); const html = await page.content(); @@ -61,18 +62,17 @@ const extractKaraokeNumber = (title: string) => { }; const refreshData = async () => { - console.log("refreshData!!!!!!!"); + console.log("refreshData"); const result = await updateKYDB(stackData); updateDataLog(result.success, "log/crawlYoutubeSuccess.txt"); updateDataLog(result.failed, "log/crawlYoutubeFailed.txt"); stackData.length = 0; // stackData 초기화 - console.log("refreshData END!!!!!!!"); }; // 사용 -const data = await getKYNULLDB(5000); +const data = await getKYNULLDB(); console.log("getKYNULLDB : ", data.length); let index = 0; @@ -90,6 +90,7 @@ for (const song of data) { } index++; console.log("scrapeSongNumber : ", index); + console.log("stackData : ", stackData.length); } console.log("totalData : ", totalData.length); @@ -100,4 +101,6 @@ updateDataLog(result.success, "log/crawlYoutubeSuccess.txt"); updateDataLog(result.failed, "log/crawlYoutubeFailed.txt"); // 5.13 1차 시도 -// 5000개 중 3507개 성공 +// 5000개 중 3507개 성공, 총 18906개 등록 + +// 5.13 2차 시도 diff --git a/packages/crawling/src/supabase/getDB.ts b/packages/crawling/src/supabase/getDB.ts index c3ca9d3..78dc841 100644 --- a/packages/crawling/src/supabase/getDB.ts +++ b/packages/crawling/src/supabase/getDB.ts @@ -45,7 +45,7 @@ export async function getKYNULLDB(max: number = 50000) { .from("songs") .select("id, title, artist, num_tj, num_ky") .is("num_ky", null) // num_ky가 null인 데이터만 가져옴 - .order("title", { ascending: true }) + .order("title", { ascending: false }) .limit(max); // Supabase 쿼리 안에서의 한계를 넘을 수는 없음 if (error) throw error;